Author: Mengna Zhang

Last updated on: 2025-09-15

Set Path

## directory: root of the user folders; can be extended to the main CNT folder
directory <- "/Users/"

## your own directory
my_directory <- paste0(directory, "mengnazhang/Desktop/")

## PRADI 2025 phenotype folder; the raw, cleaned and script folders all hang
## off this single root, so moving the project is a one-line change
pradi_root <- paste0(my_directory, "ADSP_DataPrep_local/PRADI/Phenotype/2025/")

## PRADI path (the folder where the PRADI raw files are located)
pradi_directory <- paste0(pradi_root, "Raw/")

## output path
out_directory <- paste0(pradi_root, "Cleaned/")

## script path
script_directory <- paste0(pradi_root, "Scripts/")

## revised DD path (workbook with one sheet per sub-dataset)
revisedDDpath <- paste0(out_directory, "colnamesPerSubdata.xlsx")


Load Helper Scripts

## Load the PRADI helper script; info(), check_valid_responses() and
## check_valid_numeric_responses() used below are not defined in this file and
## presumably come from it.
## NOTE(review): this is a hard-coded absolute path under "ADSP_DataPrep/",
## not under script_directory ("ADSP_DataPrep_local/...") -- confirm which
## copy of the helper script is the intended one.
source("/Users/mengnazhang/Desktop/ADSP_DataPrep/dataPrep2025/helperScripts_PRADI.R")
# source(paste0(script_directory,"helperScripts_WRAP.R"))


Load Packages

## Attach the packages used by this script. library() stops with an error
## right here when a package is not installed, whereas require() only warns
## and returns FALSE, which lets the script fail later with an unrelated
## "could not find function" message.
library(dplyr)
library(readxl)
library(openxlsx)
library(stringr)
library(tidyr)
library(lubridate)
library(ggplot2)
## "not in" operator: element-wise negation of %in%
`%!in%` <- function(x, table) !(x %in% table)



Sub Files Inspection

Load All Subfiles

## text file listing the raw PRADI workbooks to load, one file name per line
file_list <- paste0(out_directory, "filelist.txt")

## read the file names; strip stray whitespace and drop blank lines so that an
## empty (e.g. trailing) line is not mistaken for a workbook
file_names <- trimws(readLines(file_list))
file_names <- file_names[nzchar(file_names)]

## load every listed workbook into the global environment under a short name
for (fname in file_names) {
  raw_path <- paste0(pradi_directory, fname)
  if (!file.exists(raw_path)) {
    stop("Raw PRADI file listed in ", file_list, " was not found: ", raw_path,
         call. = FALSE)
  }

  ## short name = file name without the "PRADI_" prefix and the
  ## "_05122025.xlsx" suffix
  clean_name <- sub("^PRADI_", "", fname)
  clean_name <- sub("_05122025\\.xlsx$", "", clean_name)

  ## read the "Export Worksheet" sheet as a plain data.frame and bind it to
  ## the short name in the global environment
  assign(
    clean_name,
    as.data.frame(read_excel(raw_path, sheet = "Export Worksheet")),
    envir = .GlobalEnv
  )
}



Get Common Columns

## Names of the loaded sub-datasets, rebuilt from the file list (same
## prefix/suffix stripping as the loading loop). Scanning ls() for every data
## frame would also pick up scratch objects such as df, dfDD or tmp whenever
## this chunk is re-run after the per-file checks.
df_names <- trimws(file_names)
df_names <- df_names[nzchar(df_names)]
df_names <- sub("^PRADI_", "", df_names)
df_names <- sub("_05122025\\.xlsx$", "", df_names)

## keep only names that are actually bound to a data frame in the global
## environment; sorted so the order matches what ls() used to return
is_loaded_df <- vapply(
  df_names,
  function(nm) is.data.frame(get0(nm, envir = .GlobalEnv, inherits = FALSE)),
  logical(1)
)
df_names <- sort(unique(df_names[is_loaded_df]))

## extract the column names for each data frame
column_lists <- lapply(
  df_names,
  function(nm) colnames(get(nm, envir = .GlobalEnv))
)

## find the columns shared by all data frames
common_cols <- Reduce(intersect, column_lists)

## print the result
print(common_cols)
##  [1] "SYSXM"         "SYSIND"        "SYSGP"         "SYSGPSTUDY"   
##  [5] "SYSINDGP"      "CGI_ORDER"     "GPS_ORDER"     "STDCGI_ORDER" 
##  [9] "LSTUDY"        "DB_OWNER"      "STUDY"         "SUBSTUDY"     
## [13] "CENTER"        "GP"            "IND"           "REFCTR"       
## [17] "DATE_OF_BIRTH"



Save all Colnames

## This code only needs to be run once.
## It generates an Excel file in which each sheet holds the column names of
## the corresponding dataset. That workbook is then filled in with the
## information for each variable to produce the revised DD.

## Name and location of the file to save. colnames_file is not defined
## anywhere in this file, so fall back to the revised DD path when it has not
## been set elsewhere.
if (!exists("colnames_file")) {
  colnames_file <- revisedDDpath
}

if (file.exists(colnames_file)) {
  ## never overwrite a workbook that may already contain the filled-in DD
  message("Skipping column-name export: ", colnames_file, " already exists.")
} else {
  ## create a new workbook
  wb <- createWorkbook()

  ## one sheet per data frame, holding its column names in a single
  ## "VarNames" column
  for (df_name in df_names) {
    ## sheet name = data frame name, truncated to 31 characters max
    sheet_name <- substr(df_name, 1, 31)
    addWorksheet(wb, sheetName = sheet_name)
    writeData(
      wb,
      sheet = sheet_name,
      data.frame(VarNames = colnames(get(df_name)))
    )
  }

  ## save the workbook
  saveWorkbook(wb, file = colnames_file, overwrite = FALSE)
}



Variable Check Per File

AAAD_GERIAT

## work on a scratch copy: the checks below modify df, and the result is
## assigned back to AAAD_GERIAT at the end of this section
df <- AAAD_GERIAT

## info() is not defined in this file (presumably from the sourced helper
## script); the recorded output below looks like rows / columns / distinct
## SYSIND values -- confirm against the helper
info(AAAD_GERIAT,"SYSIND")
## #obs:1051, cols:62, inds:939
## extract all the unique data types (printed as a list here because the
## POSIXct columns carry two class names)
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
## 
## [[2]]
## [1] "character"
## 
## [[3]]
## [1] "logical"
## 
## [[4]]
## [1] "POSIXct" "POSIXt"
## print the structure of every column (limits raised so nothing is cut off)
str(df, max.level = 99, list.len = 99999)
Click for details
## 'data.frame':    1051 obs. of  62 variables:
##  $ SYSXM              : num  7534713 7540453 7540583 7540653 7540803 ...
##  $ SYSIND             : num  11108883 11006263 11048913 11048883 11059623 ...
##  $ SYSGP              : num  7920393 7888673 7896183 7896183 7897223 ...
##  $ SYSGPSTUDY         : num  1357713 1304013 1311503 1311503 1312543 ...
##  $ SYSINDGP           : num  7868403 7761063 7804773 7804743 7818553 ...
##  $ CGI_ORDER          : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ GPS_ORDER          : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ STDCGI_ORDER       : num  11 11 11 11 11 11 11 11 11 11 ...
##  $ LSTUDY             : chr  "ADCRLPRADI" "ADCRLPRADI" "ADFAMPRADI" "ADFAMPRADI" ...
##  $ DB_OWNER           : chr  "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" ...
##  $ STUDY              : chr  "ALZ" "ALZ" "ALZ" "ALZ" ...
##  $ SUBSTUDY           : chr  "ADCRLPRADI" "ADCRLPRADI" "ADFAMPRADI" "ADFAMPRADI" ...
##  $ CENTER             : chr  "IHG" "IHG" "IHG" "IHG" ...
##  $ GP                 : num  87634 87534 87657 87657 87699 ...
##  $ IND                : num  1 104 102 1000 101 108 1 1 1 1 ...
##  $ REFCTR             : logi  NA NA NA NA NA NA ...
##  $ EXAM_DATE          : POSIXct, format: "2018-01-12" "2018-02-21" ...
##  $ EXAMINER           : chr  "axr1589" "v.rodriguez4" "v.rodriguez4" "axr1589" ...
##  $ DATE_OF_BIRTH      : POSIXct, format: "1950-06-03" "1936-09-20" ...
##  $ AGE_AT_EXAM        : num  67 81 72 94 88 68 67 71 68 69 ...
##  $ SATISFIED_LIFE     : chr  "Y" "Y" "Y" "Y" ...
##  $ DROPPED_ACTIVITIES : chr  "N" "N" "N" "N" ...
##  $ FEEL_EMPTY         : chr  "N" "N" "N" "N" ...
##  $ GOOD_SPIRIT        : chr  "Y" "Y" "Y" "Y" ...
##  $ AFRAID_BAD_THINGS  : chr  "N" "N" "N" "N" ...
##  $ BORED              : chr  "N" "N" "N" "Y" ...
##  $ FEEL_HAPPY         : chr  "Y" "Y" "Y" "Y" ...
##  $ FEEL_HELPLESS      : chr  "N" "N" "N" "N" ...
##  $ STAY_HOME          : chr  "N" "N" "N" "N" ...
##  $ MEMORY_PROBLEM     : chr  "N" "N" "N" "N" ...
##  $ ALIVE              : chr  "Y" "Y" "Y" "Y" ...
##  $ FEEL_WORTHLESS     : chr  "N" "N" "N" "N" ...
##  $ FEEL_FULL_ENERGY   : chr  "Y" "Y" "Y" "Y" ...
##  $ FEEL_HOPELESS      : chr  "N" "N" "N" "N" ...
##  $ OTHER_BETTER_OFF   : chr  "N" "N" "N" "N" ...
##  $ TROUBLE_FALL_ASLEEP: logi  NA NA NA NA NA NA ...
##  $ TROUBLE_STAY_ASLEEP: logi  NA NA NA NA NA NA ...
##  $ SLEEPING_TOO_MUCH  : logi  NA NA NA NA NA NA ...
##  $ APPETITE_INCREASED : logi  NA NA NA NA NA NA ...
##  $ APPETITE_DECREASED : logi  NA NA NA NA NA NA ...
##  $ WEIGHT_LOSS        : logi  NA NA NA NA NA NA ...
##  $ AMOUNT_WEIGHT_LOSS : logi  NA NA NA NA NA NA ...
##  $ SATISFYING_LIFE    : logi  NA NA NA NA NA NA ...
##  $ COMMENTS           : logi  NA NA NA NA NA NA ...
##  $ RELIABLE           : logi  NA NA NA NA NA NA ...
##  $ LIFE_SCORE         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ ACTIVITY_SCORE     : num  0 0 0 0 1 0 0 0 1 1 ...
##  $ EMPTY_SCORE        : num  0 0 0 0 0 0 0 0 0 1 ...
##  $ BORED_SCORE        : num  0 0 0 1 0 0 0 0 0 0 ...
##  $ SPIRIT_SCORE       : num  0 0 0 0 0 0 0 0 0 1 ...
##  $ AFRAID_SCORE       : num  0 0 0 0 0 0 0 0 1 0 ...
##  $ HAPPY_SCORE        : num  0 0 0 0 0 0 1 0 0 0 ...
##  $ HELPLESS_SCORE     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ STAY_HOME_SCORE    : num  0 0 0 0 1 1 0 0 1 0 ...
##  $ MEMORY_SCORE       : num  0 0 0 0 1 0 0 0 0 1 ...
##  $ ALIVE_SCORE        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ WORTHLESS_SCORE    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ FULL_ENERGY_SCORE  : num  0 0 0 0 0 0 0 0 0 1 ...
##  $ HOPELESS_SCORE     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ BETTER_OFF_SCORE   : num  0 0 0 0 0 0 1 0 0 1 ...
##  $ TOTAL_STATUS       : chr  NA NA NA NA ...
##  $ TOTAL              : num  0 0 0 1 3 1 2 0 3 6 ...


Pull the regenerated DD

## revised DD sheet for this sub-dataset (sheet name = dataset name)
dfDD <- read_excel(path = revisedDDpath, sheet = "AAAD_GERIAT")


Handling Logical Variables

## extract all logical variables (vapply always yields a plain logical mask)
logicols <- colnames(df)[vapply(df, is.logical, logical(1))] ## 11 vars

## view those variables in the regenerated DD
dfDD[dfDD$VarNames %in% logicols, c("VarNames", "Data Type")]
## # A tibble: 11 × 2
##    VarNames            `Data Type`
##    <chr>               <chr>      
##  1 REFCTR              VARCHAR2(6)
##  2 TROUBLE_FALL_ASLEEP <NA>       
##  3 TROUBLE_STAY_ASLEEP <NA>       
##  4 SLEEPING_TOO_MUCH   <NA>       
##  5 APPETITE_INCREASED  <NA>       
##  6 APPETITE_DECREASED  <NA>       
##  7 WEIGHT_LOSS         <NA>       
##  8 AMOUNT_WEIGHT_LOSS  <NA>       
##  9 SATISFYING_LIFE     <NA>       
## 10 COMMENTS            <NA>       
## 11 RELIABLE            <NA>
## REFCTR is the only logical column with a data type in the DD
## (VARCHAR2(6)), so it is the only one converted to character
convert2chr <- c("REFCTR")

## convert
df[convert2chr] <- lapply(df[convert2chr], as.character)

## recheck the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
## 
## [[2]]
## [1] "character"
## 
## [[3]]
## [1] "POSIXct" "POSIXt" 
## 
## [[4]]
## [1] "logical"
## NOTE: For the other 10 variables, the DD does not provide data type information, so I’m leaving them unspecified for now.


Handling Date Variables

## extract date-time variables from the sub-dataset
datecols <- colnames(df)[
  vapply(df, function(x) inherits(x, c("POSIXct", "POSIXt")), logical(1))
]
## [1] "EXAM_DATE"     "DATE_OF_BIRTH" 

## extract date variables from the regenerated DD; the type label is compared
## case-insensitively and without surrounding blanks, so "DATE", "date" and
## "Date" are all recognised
datecolsFromDD <- dfDD$VarNames[toupper(trimws(dfDD$`Data Type`)) %in% "DATE"]

## compare the two to see whether we are missing any date variables
setdiff(datecols, datecolsFromDD)
## character(0)
setdiff(datecolsFromDD, datecols)
## character(0)
## drop = FALSE keeps a data frame even if only one date column is present
head(df[, datecols, drop = FALSE])
##    EXAM_DATE DATE_OF_BIRTH
## 1 2018-01-12    1950-06-03
## 2 2018-02-21    1936-09-20
## 3 2018-02-19    1946-01-11
## 4 2018-02-19    1923-04-17
## 5 2018-02-18    1929-10-08
## 6 2018-02-19    1949-08-01
## convert the date-time columns to plain Date
df[datecols] <- lapply(df[datecols], as.Date)

## recheck the unique data types
unique(sapply(df, class))
## [1] "numeric"   "character" "Date"      "logical"


Handling Character Variables

## extract character variables from the sub-dataset
chrcols <- colnames(df)[vapply(df, is.character, logical(1))]
## 23 vars

## check data type inconsistency:
## mismatchChrs_1: character in the data but another type in the DD
## mismatchChrs_2: character in the DD but another type in the data
chrColsfromDD <- dfDD[
  grepl("^(varchar|char)", dfDD$`Data Type`, ignore.case = TRUE),
  c("VarNames", "Data Type")
]

mismatchChrs_1 <- setdiff(chrcols, chrColsfromDD$VarNames) ## character(0)
mismatchChrs_2 <- setdiff(chrColsfromDD$VarNames, chrcols) ## character(0)

## extract character variables that have a value specification in the DD
## NOTE(review): this pattern is unanchored, while the one above only matches
## at the start of the data type -- confirm the difference is intended
tmp <- dfDD[
  grepl("CHAR|VARCHAR", dfDD$`Data Type`, ignore.case = TRUE) &
    !is.na(dfDD$`Valid Responses`),
  c("VarNames", "Valid Responses")
]

## check whether the unique values of the character columns in the dataset
## match the DD
DT::datatable(check_valid_responses(tmp, df))
## ignore EXAMINER, as I assume we can have multiple examiners


Handling Numeric Variables

## extract numeric variables from the sub-dataset
numcols <- colnames(df)[vapply(df, is.numeric, logical(1))] ## 27 vars

## check data type inconsistency against the numeric variables in the DD:
## mismatchNums_1: numeric in the data but another type in the DD
## mismatchNums_2: numeric in the DD but another type in the data
numColsfromDD <- dfDD[
  grepl("number", dfDD$`Data Type`, ignore.case = TRUE),
  c("VarNames", "Valid Responses")
]

mismatchNums_1 <- setdiff(numcols, numColsfromDD$VarNames) ## character(0)
mismatchNums_2 <- setdiff(numColsfromDD$VarNames, numcols) ## character(0)

## distinct value specifications of the numeric variables
unique(numColsfromDD$`Valid Responses`)
## [1] NA              "1 thru 99999;" "1 thru 9999;"
## numeric variables that have a value specification in the DD
tmp <- dfDD[
  grepl("number", dfDD$`Data Type`, ignore.case = TRUE) &
    !is.na(dfDD$`Valid Responses`),
  c("VarNames", "Valid Responses")
]

## check whether the values of the numeric columns comply with the DD
DT::datatable(check_valid_numeric_responses(tmp, df))
## ignore GP


Save Cleaned Data

## store the type-converted copy back under the dataset name (in-memory only;
## nothing is written to disk by this statement)
AAAD_GERIAT <- df



AAAD_MEDCON

## work on a scratch copy named df; the checks below modify df
df <- AAAD_MEDCON

## info() is not defined in this file (presumably from the sourced helper
## script); the recorded output below looks like rows / columns / distinct
## SYSIND values -- confirm against the helper
info(AAAD_MEDCON,"SYSIND")
## #obs:397, cols:256, inds:367
## extract all the unique data types (printed as a list here because the
## POSIXct columns carry two class names)
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
## 
## [[2]]
## [1] "character"
## 
## [[3]]
## [1] "logical"
## 
## [[4]]
## [1] "POSIXct" "POSIXt"
## print the structure of every column (limits raised so nothing is cut off)
str(df, max.level = 99, list.len = 99999)
Click for details
## 'data.frame':    397 obs. of  256 variables:
##  $ SYSXM                     : num  7134193 7839953 7838803 7838853 7838933 ...
##  $ SYSIND                    : num  11010563 11368403 11368463 11368453 11368443 ...
##  $ SYSGP                     : num  7889553 7950923 7950983 7950973 7950963 ...
##  $ SYSGPSTUDY                : num  1304893 1396033 1396093 1396083 1396073 ...
##  $ SYSINDGP                  : num  7765583 8137673 8137733 8137723 8137713 ...
##  $ CGI_ORDER                 : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ GPS_ORDER                 : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ STDCGI_ORDER              : num  11 11 11 11 11 11 11 11 11 11 ...
##  $ LSTUDY                    : chr  "ADFAMPRADI" "ADCONTROL" "ADCRLPRADI" "ADCRLPRADI" ...
##  $ DB_OWNER                  : chr  "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" ...
##  $ STUDY                     : chr  "ALZ" "ALZ" "ALZ" "ALZ" ...
##  $ SUBSTUDY                  : chr  "ADFAMPRADI" "ADCONTROL" "ADCRLPRADI" "ADCRLPRADI" ...
##  $ CENTER                    : chr  "IHG" "IHG" "IHG" "IHG" ...
##  $ GP                        : num  87580 88413 88419 88418 88417 ...
##  $ IND                       : num  1 1 1 1 1 100 1 1 1 1 ...
##  $ REFCTR                    : logi  NA NA NA NA NA NA ...
##  $ EXAM_DATE                 : POSIXct, format: "2016-09-07" "2020-03-06" ...
##  $ EXAMINER                  : chr  "ladams4" "sjt82" "mxp1257" "mxp1257" ...
##  $ DATE_OF_BIRTH             : POSIXct, format: "1925-08-21" "1928-02-14" ...
##  $ AGE_AT_EXAM               : num  91 92 74 75 75 64 83 77 79 85 ...
##  $ REVIEW_DATE               : logi  NA NA NA NA NA NA ...
##  $ REVIEWER                  : logi  NA NA NA NA NA NA ...
##  $ MEMORY_COMPLAINTS         : num  NA 1 0 0 0 1 0 0 0 0 ...
##  $ DATE_OF_ONSET             : POSIXct, format: NA NA ...
##  $ DOA_UNK                   : chr  NA "U" NA NA ...
##  $ DESCRIBE                  : chr  "hello" "No family history of dementia." "Brother with PD." "Denies psychological/psychiatric conditions.  Brother, mother, aunt (father's sister) and grandparents with AD "| __truncated__ ...
##  $ MEM_COMPLAINTS            : chr  NA "misplaces objects  Informant: Daughter and self 92 YO widow female who lives with daughter and  visits senior c"| __truncated__ "None. Lives alone. Can cook, clean and take care of self. Fully capable of self care and fully oriented. Drives"| __truncated__ "none. denies memoery issues. Lives with daughter. can cook, clean and do chores but daughter helps. Says she co"| __truncated__ ...
##  $ CURRENT_MED               : chr  NA NA "Hypertension and Thyroid issues" "Hypertension + neuropathy + metformin" ...
##  $ PMH                       : chr  NA "HTN" "Hypertension and Thyroid issues" NA ...
##  $ MOOD_CHANGES              : chr  NA "denies" "denies." NA ...
##  $ MEDICATIONS               : chr  NA "lisinopril, clopidogrel" NA "not collected" ...
##  $ HYPERTENSION_DX           : num  NA 1 NA NA NA 1 NA 0 1 1 ...
##  $ HYPERTENSION_TREATED      : num  NA 1 NA NA NA 1 NA 0 NA NA ...
##  $ DIABETES_DX               : num  NA 0 NA NA NA 1 NA 0 0 1 ...
##  $ DIABETES_TREATED          : num  NA NA NA NA NA 1 NA 0 NA NA ...
##  $ MYOCARDIAL_DX             : num  NA 0 NA NA NA 0 NA 0 0 1 ...
##  $ MYOCARDIAL_TREATED        : num  NA NA NA NA NA 0 NA 0 NA NA ...
##  $ HEART_FAILURE_DX          : num  NA 0 NA NA NA 0 NA 0 0 0 ...
##  $ HEART_FAILURE_TREATED     : num  NA NA NA NA NA 0 NA 0 NA NA ...
##  $ HEART_DISEASE_DX          : num  NA 0 NA NA NA 0 NA 0 0 0 ...
##  $ HEART_DISEASE_TREATED     : num  NA NA NA NA NA 0 NA 0 NA NA ...
##  $ COPD_DX                   : num  NA 0 NA NA NA 0 NA 0 0 1 ...
##  $ COPD_TREATED              : num  NA NA NA NA NA 0 NA 0 NA NA ...
##  $ THYROID_DX                : num  NA 0 NA NA NA 0 NA 1 0 0 ...
##  $ THYROID_TREATED           : num  NA NA NA NA NA 0 NA 1 NA NA ...
##  $ LIVER_DX                  : num  NA 0 NA NA NA 0 NA 0 0 0 ...
##  $ LIVER_TREATED             : num  NA NA NA NA NA 0 NA 0 NA NA ...
##  $ RENAL_DX                  : num  NA 0 NA NA NA 0 NA 0 0 0 ...
##  $ RENAL_TREATED             : num  NA NA NA NA NA 0 NA 0 NA NA ...
##  $ PEPTIC_DX                 : num  NA 0 NA NA NA 0 NA 0 0 0 ...
##  $ PEPTIC_TREATED            : num  NA NA NA NA NA 0 NA 0 NA NA ...
##  $ PERIPHERAL_DX             : num  NA 0 NA NA NA 0 NA 1 0 0 ...
##  $ PERIPHERAL_TREATED        : num  NA NA NA NA NA 0 NA 1 NA NA ...
##  $ STROKE_DX                 : num  NA 0 NA NA NA 0 NA 0 0 0 ...
##  $ STROKE_TREATED            : num  NA NA NA NA NA 0 NA 0 NA NA ...
##  $ TIA_DX                    : num  NA 0 NA NA NA 0 NA 0 0 1 ...
##  $ TIA_TREATED               : num  NA NA NA NA NA 0 NA 0 NA NA ...
##  $ HEAD_INJURY_DX            : num  NA 0 NA NA NA 0 NA 0 0 0 ...
##  $ HEAD_INJURY_TREATED       : num  NA NA NA NA NA 0 NA 0 NA NA ...
##  $ SEIZURE_DX                : num  NA 0 NA NA NA 0 NA 0 0 0 ...
##  $ SEIZURE_TREATED           : num  NA NA NA NA NA 0 NA 0 NA NA ...
##  $ CANCER_DX                 : num  NA 0 NA NA NA 0 NA 0 0 0 ...
##  $ CANCER_TREATED            : num  NA NA NA NA NA 0 NA 0 NA NA ...
##  $ ARTHRITIS_DX              : num  NA 0 NA NA NA 0 NA 0 0 0 ...
##  $ ARTHRITIS_TREATED         : num  NA NA NA NA NA 0 NA 0 NA NA ...
##  $ SYPHILIS_DX               : num  NA 0 NA NA NA 0 NA 0 0 0 ...
##  $ SYPHILIS_TREATED          : num  NA NA NA NA NA 0 NA 0 NA NA ...
##  $ ALCOHOL_DX                : num  NA 0 NA NA NA 0 NA 0 0 1 ...
##  $ ALCOHOL_TREATED           : num  NA NA NA NA NA 0 NA 0 0 9 ...
##  $ ILLICIT_DRUG_DX           : num  NA 0 NA NA NA 0 NA 0 0 0 ...
##  $ ILLICIT_DRUG_TREATED      : num  NA NA NA NA NA 0 NA 0 0 9 ...
##  $ SMOKING_DX                : num  NA 0 NA NA NA 0 NA 0 0 1 ...
##  $ SMOKING_TREATED           : num  NA NA NA NA NA 0 NA 0 0 9 ...
##  $ PD_DX                     : num  NA 0 NA NA NA 0 NA 0 0 0 ...
##  $ PD_TREATED                : num  NA NA NA NA NA 0 NA 0 0 9 ...
##  $ HUNTINGTON_DX             : num  NA 0 NA NA NA 0 NA 0 0 0 ...
##  $ HUNTINGTON_TREATED        : num  NA NA NA NA NA 0 NA 0 0 9 ...
##  $ MULTIPLE_SCLEROSIS_DX     : num  NA 0 NA NA NA 0 NA 0 0 0 ...
##  $ MULTIPLE_SCLEROSIS_TREATED: num  NA NA NA NA NA 0 NA 0 NA 9 ...
##  $ B12_DX                    : num  NA 0 NA NA NA 0 NA 0 0 0 ...
##  $ B12_TREATED               : num  NA NA NA NA NA 0 NA 0 NA NA ...
##  $ HYDROCEPHALUS_DX          : num  NA 0 NA NA NA 0 NA 0 0 0 ...
##  $ HYDROCEPHALUS_TREATED     : num  NA NA NA NA NA 0 NA 0 NA NA ...
##  $ TREMOR_DX                 : num  NA 0 NA NA NA 0 NA 0 0 0 ...
##  $ TREMOR_TREATED            : num  NA NA NA NA NA 0 NA 0 NA NA ...
##  $ DOWN_SYNDROME_DX          : num  NA 0 NA NA NA 0 NA 0 0 0 ...
##  $ DOWN_SYNDROME_TREATED     : num  NA NA NA NA NA 0 NA 0 NA NA ...
##  $ MED_CONDITIONS_DX         : num  NA 0 NA NA NA 0 NA 0 0 0 ...
##  $ MED_CONDITIONS_TREATED    : num  NA NA NA NA NA 0 NA 0 NA NA ...
##  $ OTH_MED_COND_SP           : chr  NA NA NA NA ...
##  $ STROKE_BRAIN              : num  NA 0 NA NA NA 0 NA 0 9 1 ...
##  $ DOCTOR                    : num  NA NA NA NA NA 0 NA 0 9 1 ...
##  $ STROKE_PAST               : num  NA NA NA NA NA 0 NA 0 9 1 ...
##  $ STROKE_24HRS              : num  NA NA NA NA NA 0 NA 0 9 0 ...
##  $ SYMPTOMS                  : num  NA NA NA NA NA 0 NA 0 9 0 ...
##  $ LOST_SPEECH               : num  NA 0 NA NA NA 0 NA 0 9 9 ...
##  $ LOST_UNDERSTAND           : num  NA 0 NA NA NA 0 NA 0 9 9 ...
##  $ LOSS_CONSCIOUS            : num  NA 0 NA NA NA 0 NA 0 9 9 ...
##  $ WEAKNESS                  : num  NA 0 NA NA NA 0 NA 0 9 9 ...
##  $ NUMBNESS                  : num  NA 0 NA NA NA 0 NA 0 9 9 ...
##  $ LOSS_VISION               : num  NA 0 NA NA NA 0 NA 0 9 9 ...
##  $ HALF_VISION               : num  NA 0 NA NA NA 0 NA 0 9 9 ...
##  $ PERIOD                    : num  NA 9 NA NA NA 9 NA 0 9 0 ...
##  $ AGE                       : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ DONT_KNOW                 : chr  NA NA NA NA ...
##  $ SEEK_HELP                 : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ TREATMENT                 : num  NA NA NA NA NA NA NA 0 NA NA ...
##  $ MEDS                      : num  NA NA NA NA NA NA NA 0 NA NA ...
##  $ PSYCHOTHERAPY             : num  NA NA NA NA NA NA NA 0 NA NA ...
##  $ OTHER                     : num  NA NA NA NA NA NA NA 0 NA NA ...
##  $ SPECIFY                   : chr  NA NA NA NA ...
##  $ UNKNOWN                   : num  NA NA NA NA NA NA NA 0 NA NA ...
##  $ TAKING_MEDS               : num  NA NA NA NA NA NA NA 1 NA NA ...
##  $ MEDICATION1               : chr  NA NA NA NA ...
##  $ STRENGTH1                 : chr  NA NA NA NA ...
##  $ SEEN1                     : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ SEEN1_SPEC                : chr  NA NA NA NA ...
##  $ MEDICATION2               : chr  NA NA NA NA ...
##  $ STRENGTH2                 : chr  NA NA NA NA ...
##  $ SEEN2                     : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ SEEN2_SPEC                : chr  NA NA NA NA ...
##  $ MEDICATION3               : chr  NA NA NA NA ...
##  $ STRENGTH3                 : chr  NA NA NA NA ...
##  $ SEEN3                     : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ SEEN3_SPEC                : chr  NA NA NA NA ...
##  $ MEDICATION4               : chr  NA NA NA NA ...
##  $ STRENGTH4                 : chr  NA NA NA NA ...
##  $ SEEN4                     : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ SEEN4_SPEC                : chr  NA NA NA NA ...
##  $ MEDICATION5               : chr  NA NA NA NA ...
##  $ STRENGTH5                 : chr  NA NA NA NA ...
##  $ SEEN5                     : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ SEEN5_SPEC                : logi  NA NA NA NA NA NA ...
##  $ MEDICATION6               : chr  NA NA NA NA ...
##  $ STRENGTH6                 : chr  NA NA NA NA ...
##  $ SEEN6                     : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ SEEN6_SPEC                : logi  NA NA NA NA NA NA ...
##  $ MEDICATION7               : chr  NA NA NA NA ...
##  $ STRENGTH7                 : chr  NA NA NA NA ...
##  $ SEEN7                     : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ SEEN7_SPEC                : logi  NA NA NA NA NA NA ...
##  $ MEDICATION8               : chr  NA NA NA NA ...
##  $ STRENGTH8                 : chr  NA NA NA NA ...
##  $ SEEN8                     : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ SEEN8_SPEC                : logi  NA NA NA NA NA NA ...
##  $ MEDICATION9               : chr  NA NA NA NA ...
##  $ STRENGTH9                 : chr  NA NA NA NA ...
##  $ SEEN9                     : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ SEEN9_SPEC                : logi  NA NA NA NA NA NA ...
##  $ MEDICATION10              : chr  NA NA NA NA ...
##  $ STRENGTH10                : chr  NA NA NA NA ...
##  $ SEEN10                    : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ SEEN10_SPEC               : logi  NA NA NA NA NA NA ...
##  $ MEDICATION11              : chr  NA NA NA NA ...
##  $ STRENGTH11                : chr  NA NA NA NA ...
##  $ SEEN11                    : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ SEEN11_SPEC               : logi  NA NA NA NA NA NA ...
##  $ MEDICATION12              : chr  NA NA NA NA ...
##  $ STRENGTH12                : chr  NA NA NA NA ...
##  $ SEEN12                    : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ SEEN12_SPEC               : logi  NA NA NA NA NA NA ...
##  $ MEDICATION13              : chr  NA NA NA NA ...
##  $ STRENGTH13                : chr  NA NA NA NA ...
##  $ SEEN13                    : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ SEEN13_SPEC               : logi  NA NA NA NA NA NA ...
##  $ MEDICATION14              : chr  NA NA NA NA ...
##  $ STRENGTH14                : chr  NA NA NA NA ...
##  $ SEEN14                    : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ SEEN14_SPEC               : logi  NA NA NA NA NA NA ...
##  $ MEDICATION15              : chr  NA NA NA NA ...
##  $ STRENGTH15                : logi  NA NA NA NA NA NA ...
##  $ SEEN15                    : logi  NA NA NA NA NA NA ...
##  $ SEEN15_SPEC               : logi  NA NA NA NA NA NA ...
##  $ MEDICATION16              : chr  NA NA NA NA ...
##  $ STRENGTH16                : logi  NA NA NA NA NA NA ...
##  $ SEEN16                    : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ SEEN16_SPEC               : logi  NA NA NA NA NA NA ...
##  $ MEDICATION17              : chr  NA NA NA NA ...
##  $ STRENGTH17                : logi  NA NA NA NA NA NA ...
##  $ SEEN17                    : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ SEEN17_SPEC               : logi  NA NA NA NA NA NA ...
##  $ MEDICATION18              : logi  NA NA NA NA NA NA ...
##  $ STRENGTH18                : logi  NA NA NA NA NA NA ...
##  $ SEEN18                    : logi  NA NA NA NA NA NA ...
##  $ SEEN18_SPEC               : logi  NA NA NA NA NA NA ...
##  $ MEDICATION19              : logi  NA NA NA NA NA NA ...
##  $ STRENGTH19                : logi  NA NA NA NA NA NA ...
##  $ SEEN19                    : logi  NA NA NA NA NA NA ...
##  $ SEEN19_SPEC               : logi  NA NA NA NA NA NA ...
##  $ MEDICATION20              : logi  NA NA NA NA NA NA ...
##  $ STRENGTH20                : logi  NA NA NA NA NA NA ...
##  $ SEEN20                    : logi  NA NA NA NA NA NA ...
##  $ SEEN20_SPEC               : logi  NA NA NA NA NA NA ...
##  $ NOTES                     : chr  NA NA NA NA ...
##  $ WARFARIN                  : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ ASPIRIN                   : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ ANTIPLATELETS             : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ DIURETICS                 : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ ANTICONVULSANTS           : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ INSULIN                   : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ HYPOGLYCEMICS             : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ SULFONYLUREA              : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ METFORMIN                 : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ GLITAZONES                : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ DIGITALIS                 : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ NITRATES                  : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ CALCIUM_CHANNEL           : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ BETA_2_AGAONIST           : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ BETA_BLOCKERS             : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ ACE                       : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ ANTI_ARRHYTHMICS          : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ ANTI_HYPERLIPIDEMICS      : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ STATIN_DRUG               : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ FIBRATE_DRUG              : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ THYROID                   : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ ANTICHOLINERGICS          : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ LEVODOPA                  : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ DOPAMINE                  : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ ANTIDEPRESSANTS           : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ ANTIPSYCHOTICS            : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ ANXIOLYTICS               : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ CHOLINESTERASE            : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ RIVASTIGMINE              : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ TACRINE                   : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ DONEPEZIL                 : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ GALANTAMINE               : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ NMDA                      : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ MEMANTINE                 : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ ALPHA_BLOCKERS            : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ HYPNOTICS                 : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ H1_BLOCKERS               : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ H2_BLOCKERS               : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ NSAID                     : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ COX2                      : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ NARCOTICS                 : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ HYDERGINE                 : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ DEPRENYL                  : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ ESTROGEN_SUPP             : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ PRESCRIPTION              : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ OTC                       : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ STEROIDS                  : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ OTHER_MEDS                : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ SPEC_MEDS                 : chr  NA NA NA NA ...
##  $ MULTIVITAMINS             : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ VITAMIN_C                 : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ VITAMIN_E                 : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ VITAMINE_B12              : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ COENZYME_Q                : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ DHA                       : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ LECITHIN                  : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ GINKGO                    : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ FOLIC_ACID                : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ VITAMIN_B6                : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ VITAMIN_D                 : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ OMEGA3                    : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ MEDCOND_COMENTS           : logi  NA NA NA NA NA NA ...


Pull the regenerated DD

## revised DD sheet for this sub-dataset (sheet name = dataset name)
dfDD <- read_excel(path = revisedDDpath, sheet = "AAAD_MEDCON")


Handling Logical Variables

## extract all logical variables (vapply always yields a plain logical mask)
logicols <- colnames(df)[vapply(df, is.logical, logical(1))] ## 33 vars

## view those variables in the regenerated DD
dfDD[dfDD$VarNames %in% logicols, c("VarNames", "Data Type")]
## # A tibble: 33 × 2
##    VarNames    `Data Type`  
##    <chr>       <chr>        
##  1 REFCTR      VARCHAR2(6)  
##  2 REVIEW_DATE date         
##  3 REVIEWER    VARCHAR      
##  4 SEEN5_SPEC  VARCHAR2(100)
##  5 SEEN6_SPEC  VARCHAR2(100)
##  6 SEEN7_SPEC  VARCHAR2(100)
##  7 SEEN8_SPEC  VARCHAR2(100)
##  8 SEEN9_SPEC  VARCHAR2(100)
##  9 SEEN10_SPEC VARCHAR2(100)
## 10 SEEN11_SPEC VARCHAR2(100)
## # ℹ 23 more rows
## select the vars to be converted to numeric; matched case-insensitively,
## like every other DD type lookup in this script, so a lower-case "number"
## entry is not silently turned into character below
convert2num <- dfDD$VarNames[
  dfDD$VarNames %in% logicols &
    grepl("NUMBER", dfDD$`Data Type`, ignore.case = TRUE)
] ## "SEEN15" "SEEN18" "SEEN19" "SEEN20"

## select the vars to be converted to date
convert2date <- dfDD$VarNames[
  dfDD$VarNames %in% logicols &
    grepl("date", dfDD$`Data Type`, ignore.case = TRUE)
] ## REVIEW_DATE

## the rest should be converted to character
convert2chr <- setdiff(logicols, c(convert2num, convert2date)) ## 28 vars

## convert
df[convert2num] <- lapply(df[convert2num], as.numeric)
df[convert2date] <- lapply(df[convert2date], as.Date)
df[convert2chr] <- lapply(df[convert2chr], as.character)

## recheck the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
## 
## [[2]]
## [1] "character"
## 
## [[3]]
## [1] "POSIXct" "POSIXt" 
## 
## [[4]]
## [1] "Date"


Handling Date Variables

## extract date-time variables from the sub-dataset
datecols <- colnames(df)[
  vapply(df, function(x) inherits(x, c("POSIXct", "POSIXt")), logical(1))
]
## [1] "EXAM_DATE"     "DATE_OF_BIRTH" "DATE_OF_ONSET"

## extract date variables from the regenerated DD; the type label is compared
## case-insensitively and without surrounding blanks, so "DATE", "date" and
## "Date" are all recognised
datecolsFromDD <- dfDD$VarNames[toupper(trimws(dfDD$`Data Type`)) %in% "DATE"]

## compare the two to see whether we are missing any date variables
setdiff(datecols, datecolsFromDD)
## character(0)
## "REVIEW_DATE" shows up here; ignore it, since it was already converted to
## Date in the logical-variables step
setdiff(datecolsFromDD, datecols)
## [1] "REVIEW_DATE"
## drop = FALSE keeps a data frame even if only one date column is present
head(df[, datecols, drop = FALSE])
##    EXAM_DATE DATE_OF_BIRTH DATE_OF_ONSET
## 1 2016-09-07    1925-08-21          <NA>
## 2 2020-03-06    1928-02-14          <NA>
## 3 2020-03-06    1945-09-01          <NA>
## 4 2020-03-06    1944-03-22          <NA>
## 5 2020-03-06    1944-10-17          <NA>
## 6 2020-03-05    1955-06-26    2016-09-09
## convert the date-time columns to plain Date
df[datecols] <- lapply(df[datecols], as.Date)

## recheck the unique data types
unique(sapply(df, class))
## [1] "numeric"   "character" "Date"


Handling Character Variables

## character columns actually present in the sub-dataset
chrcols <- names(Filter(is.character, df))
## 81 vars

## DD rows whose declared type starts with VARCHAR/CHAR; used for both
## mismatch checks:
##   mismatchChrs_1: character in the data but some other type in the DD
##   mismatchChrs_2: character in the DD but some other type in the data
chrColsfromDD <- dfDD %>%
  dplyr::filter(grepl("^(varchar|char)", `Data Type`, ignore.case = TRUE)) %>%
  dplyr::select(VarNames, `Data Type`)

mismatchChrs_1 <- setdiff(chrcols, chrColsfromDD[["VarNames"]]) ## DOA_UNK, ignore, I have updated DD to "char"
mismatchChrs_2 <- setdiff(chrColsfromDD[["VarNames"]], chrcols) ## character(0)

## character variables that come with a value specification in the DD
tmp <- dfDD %>%
  dplyr::filter(
    grepl("CHAR|VARCHAR", `Data Type`, ignore.case = TRUE),
    !is.na(`Valid Responses`)
  ) %>%
  dplyr::select(VarNames, `Valid Responses`)

## compare the observed values of those columns against the DD specification
DT::datatable(check_valid_responses(tmp, df))
## All values are within valid ranges.


Handling Numeric Variables

## numeric columns actually present in the sub-dataset
numcols <- names(Filter(is.numeric, df)) ## 31 vars

## DD rows declared as NUMBER; used for both mismatch checks:
##   mismatchNums_1: numeric in the data but some other type in the DD
##   mismatchNums_2: numeric in the DD but some other type in the data
numColsfromDD <- dfDD %>%
  dplyr::filter(grepl("number", `Data Type`, ignore.case = TRUE)) %>%
  dplyr::select(VarNames, `Valid Responses`)

mismatchNums_1 <- setdiff(numcols, numColsfromDD[["VarNames"]]) ## character(0)
mismatchNums_2 <- setdiff(numColsfromDD[["VarNames"]], numcols) ## character(0)

unique(numColsfromDD[["Valid Responses"]])
## [1] NA                           "1 thru 99999;"             
## [3] "1 thru 9999;"               "0;\r\n1;"                  
## [5] "0;\r\n1;\r\n9;\r\n-1;"      "0;\r\n1;\r\n7;\r\n8;\r\n9;"
## [7] "0;\r\n1;\r\n9;"
## numeric DD variables that come with a value specification
tmp <- numColsfromDD[!is.na(numColsfromDD[["Valid Responses"]]), ]

DT::datatable(check_valid_numeric_responses(tmp, df))
## ignore GP and IND


Save Cleaned Data

## overwrite AAAD_MEDCON with the type-corrected working copy `df`
AAAD_MEDCON <- df



AAAD_SOCIO_DEMO

## work on a copy; AAAD_SOCIO_DEMO itself is only overwritten in the
## "Save Cleaned Data" step
df <- AAAD_SOCIO_DEMO

## info() comes from the sourced helper script; judging by its output it
## reports the number of observations, columns and distinct SYSIND values
info(AAAD_SOCIO_DEMO,"SYSIND")
## #obs:402, cols:161, inds:391
## extract all the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
## 
## [[2]]
## [1] "character"
## 
## [[3]]
## [1] "logical"
## 
## [[4]]
## [1] "POSIXct" "POSIXt"
## full structure listing; the large list.len keeps str() from truncating
## the list of columns
str(df, max.level = 99, list.len = 99999)
Click for details
## 'data.frame':    402 obs. of  161 variables:
##  $ SYSXM        : num  7895153 7875263 7879973 7879993 7880213 ...
##  $ SYSIND       : num  11218613 11036843 11041143 11041043 11005233 ...
##  $ SYSGP        : num  7928123 7893863 7894373 7894373 7888553 ...
##  $ SYSGPSTUDY   : num  1366233 1309183 1309693 1309693 1303893 ...
##  $ SYSINDGP     : num  7981883 7792583 7797003 7796903 7760033 ...
##  $ CGI_ORDER    : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ GPS_ORDER    : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ STDCGI_ORDER : num  11 11 11 11 11 11 11 11 11 11 ...
##  $ LSTUDY       : chr  "ADCRLPRADI" "ADCRLPRADI" "ADFAMPRADI" "ADFAMPRADI" ...
##  $ DB_OWNER     : chr  "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" ...
##  $ STUDY        : chr  "ALZ" "ALZ" "ALZ" "ALZ" ...
##  $ SUBSTUDY     : chr  "ADCRLPRADI" "ADCRLPRADI" "ADFAMPRADI" "ADFAMPRADI" ...
##  $ CENTER       : chr  "IHG" "IHG" "IHG" "IHG" ...
##  $ GP           : num  87998 87598 87502 87502 87501 ...
##  $ IND          : num  1 1 102 100 1 1 1 1 1 1 ...
##  $ REFCTR       : logi  NA NA NA NA NA NA ...
##  $ EXAM_DATE    : POSIXct, format: "2021-02-01" "2020-07-14" ...
##  $ EXAMINER     : chr  "sjt82" "v.rodriguez4" "prm72" "prm72" ...
##  $ DATE_OF_BIRTH: POSIXct, format: "1943-09-22" "1946-10-04" ...
##  $ AGE_AT_EXAM  : num  77 73 69 71 81 84 85 77 83 85 ...
##  $ REVIEW_DATE  : logi  NA NA NA NA NA NA ...
##  $ REVIEWER     : logi  NA NA NA NA NA NA ...
##  $ SDF1         : num  0 NA NA NA NA NA NA NA NA NA ...
##  $ SDF2         : num  1 NA NA NA NA NA NA NA NA NA ...
##  $ SDF2A        : chr  "WH" NA NA NA ...
##  $ SDF3         : num  0 NA NA NA NA NA NA NA NA NA ...
##  $ SDF3A        : chr  "SPANISH" NA NA NA ...
##  $ SDF4         : chr  "W" NA NA NA ...
##  $ SDF4A        : chr  NA NA NA NA ...
##  $ SDF5         : chr  "A" NA NA NA ...
##  $ SDF5A        : chr  NA NA NA NA ...
##  $ SDF6         : chr  "SF" NA NA NA ...
##  $ SDF6A        : chr  NA NA NA NA ...
##  $ SDF7         : chr  NA NA NA NA ...
##  $ SDF8A        : chr  "PT" NA NA NA ...
##  $ SDF8B        : chr  NA NA NA NA ...
##  $ SDF8C        : chr  NA NA NA NA ...
##  $ SDF9         : chr  NA NA NA NA ...
##  $ SDF10        : chr  NA NA NA NA ...
##  $ SDF11        : chr  NA NA NA NA ...
##  $ SDF12        : logi  NA NA NA NA NA NA ...
##  $ SDF13        : logi  NA NA NA NA NA NA ...
##  $ SDF14        : logi  NA NA NA NA NA NA ...
##  $ SDF15        : logi  NA NA NA NA NA NA ...
##  $ SDF16        : logi  NA NA NA NA NA NA ...
##  $ SDF17        : logi  NA NA NA NA NA NA ...
##  $ SDF17A       : logi  NA NA NA NA NA NA ...
##  $ SDF18        : logi  NA NA NA NA NA NA ...
##  $ SDF19        : logi  NA NA NA NA NA NA ...
##  $ SDF20        : chr  "TEACHER ASSISTANT" "UNKNOWN" "Computer service tech" "Engineer" ...
##  $ SDF21        : chr  NA NA NA NA ...
##  $ SDF22        : logi  NA NA NA NA NA NA ...
##  $ SDF22A       : chr  NA NA NA NA ...
##  $ SDF23        : logi  NA NA NA NA NA NA ...
##  $ SDF24        : logi  NA NA NA NA NA NA ...
##  $ SDF25        : logi  NA NA NA NA NA NA ...
##  $ SDF26        : logi  NA NA NA NA NA NA ...
##  $ SDF27A       : logi  NA NA NA NA NA NA ...
##  $ SDF27B       : logi  NA NA NA NA NA NA ...
##  $ SDF27C       : logi  NA NA NA NA NA NA ...
##  $ SDF27D       : logi  NA NA NA NA NA NA ...
##  $ SDF27E       : logi  NA NA NA NA NA NA ...
##  $ SDF27F       : logi  NA NA NA NA NA NA ...
##  $ SDF27G       : logi  NA NA NA NA NA NA ...
##  $ SDF27H       : logi  NA NA NA NA NA NA ...
##  $ SDF28        : logi  NA NA NA NA NA NA ...
##  $ SDF29        : logi  NA NA NA NA NA NA ...
##  $ SDF30A       : logi  NA NA NA NA NA NA ...
##  $ SDF30B       : logi  NA NA NA NA NA NA ...
##  $ SDF30C       : logi  NA NA NA NA NA NA ...
##  $ SDF30D       : logi  NA NA NA NA NA NA ...
##  $ SDF30E       : logi  NA NA NA NA NA NA ...
##  $ SDF30F       : logi  NA NA NA NA NA NA ...
##  $ SDF30G       : logi  NA NA NA NA NA NA ...
##  $ SDF31        : logi  NA NA NA NA NA NA ...
##  $ SDF31A       : logi  NA NA NA NA NA NA ...
##  $ SDF32        : logi  NA NA NA NA NA NA ...
##  $ SDF33        : logi  NA NA NA NA NA NA ...
##  $ SDF33A       : logi  NA NA NA NA NA NA ...
##  $ SDF34        : logi  NA NA NA NA NA NA ...
##  $ SDF35        : logi  NA NA NA NA NA NA ...
##  $ SDF36        : logi  NA NA NA NA NA NA ...
##  $ SDF37        : logi  NA NA NA NA NA NA ...
##  $ SDF38        : logi  NA NA NA NA NA NA ...
##  $ SDF39        : logi  NA NA NA NA NA NA ...
##  $ SDF40        : logi  NA NA NA NA NA NA ...
##  $ SDF41        : logi  NA NA NA NA NA NA ...
##  $ SDF42        : logi  NA NA NA NA NA NA ...
##  $ SDF42A       : logi  NA NA NA NA NA NA ...
##  $ SDF42B       : logi  NA NA NA NA NA NA ...
##  $ SDF43A       : logi  NA NA NA NA NA NA ...
##  $ SDF43A1      : logi  NA NA NA NA NA NA ...
##  $ SDF43B       : logi  NA NA NA NA NA NA ...
##  $ SDF43B1      : logi  NA NA NA NA NA NA ...
##  $ SDF43C       : logi  NA NA NA NA NA NA ...
##  $ SDF43C1      : logi  NA NA NA NA NA NA ...
##  $ SDF44        : logi  NA NA NA NA NA NA ...
##  $ SDF44A       : logi  NA NA NA NA NA NA ...
##  $ SDF45A       : logi  NA NA NA NA NA NA ...
##  $ SDF45A1      : logi  NA NA NA NA NA NA ...
##  $ SDF45B       : logi  NA NA NA NA NA NA ...
##  $ SDF45B1      : logi  NA NA NA NA NA NA ...
##  $ SDF46        : logi  NA NA NA NA NA NA ...
##  $ SDF47        : logi  NA NA NA NA NA NA ...
##  $ SDF48        : logi  NA NA NA NA NA NA ...
##  $ SDF49A       : logi  NA NA NA NA NA NA ...
##  $ SDF49B       : logi  NA NA NA NA NA NA ...
##  $ SDF49C       : logi  NA NA NA NA NA NA ...
##  $ SDF49D       : logi  NA NA NA NA NA NA ...
##  $ SDF50A       : logi  NA NA NA NA NA NA ...
##  $ SDF50B       : logi  NA NA NA NA NA NA ...
##  $ SDF50C       : logi  NA NA NA NA NA NA ...
##  $ SDF50D       : logi  NA NA NA NA NA NA ...
##  $ SDF51        : logi  NA NA NA NA NA NA ...
##  $ SDF51A       : logi  NA NA NA NA NA NA ...
##  $ SDF52A       : logi  NA NA NA NA NA NA ...
##  $ SDF52B       : logi  NA NA NA NA NA NA ...
##  $ SDF53A       : logi  NA NA NA NA NA NA ...
##  $ SDF53A1      : logi  NA NA NA NA NA NA ...
##  $ SDF53B       : logi  NA NA NA NA NA NA ...
##  $ SDF53C       : logi  NA NA NA NA NA NA ...
##  $ SDF53C1      : logi  NA NA NA NA NA NA ...
##  $ SDF54        : logi  NA NA NA NA NA NA ...
##  $ SDF55        : logi  NA NA NA NA NA NA ...
##  $ SDF56        : logi  NA NA NA NA NA NA ...
##  $ SDF57A       : logi  NA NA NA NA NA NA ...
##  $ SDF57B       : logi  NA NA NA NA NA NA ...
##  $ SDF57C       : logi  NA NA NA NA NA NA ...
##  $ SDF57D       : logi  NA NA NA NA NA NA ...
##  $ SDF58A       : logi  NA NA NA NA NA NA ...
##  $ SDF58B       : logi  NA NA NA NA NA NA ...
##  $ SDF58C       : logi  NA NA NA NA NA NA ...
##  $ SDF58D       : logi  NA NA NA NA NA NA ...
##  $ SDF59        : logi  NA NA NA NA NA NA ...
##  $ SDF59A       : logi  NA NA NA NA NA NA ...
##  $ SDF60A       : logi  NA NA NA NA NA NA ...
##  $ SDF60B       : logi  NA NA NA NA NA NA ...
##  $ SDF60C       : logi  NA NA NA NA NA NA ...
##  $ SDF60D       : logi  NA NA NA NA NA NA ...
##  $ SDF60E       : logi  NA NA NA NA NA NA ...
##  $ SDF60F       : logi  NA NA NA NA NA NA ...
##  $ SDF60FS      : logi  NA NA NA NA NA NA ...
##  $ SDF60G       : logi  NA NA NA NA NA NA ...
##  $ SDF60GS      : logi  NA NA NA NA NA NA ...
##  $ SDF61A       : logi  NA NA NA NA NA NA ...
##  $ SDF61A1      : logi  NA NA NA NA NA NA ...
##  $ SDF61B       : logi  NA NA NA NA NA NA ...
##  $ SDF61C       : logi  NA NA NA NA NA NA ...
##  $ SDF61C1      : logi  NA NA NA NA NA NA ...
##  $ SDF62A       : logi  NA NA NA NA NA NA ...
##  $ SDF62B       : logi  NA NA NA NA NA NA ...
##  $ SDF62C       : logi  NA NA NA NA NA NA ...
##  $ SDF62D       : logi  NA NA NA NA NA NA ...
##  $ SDF63A       : logi  NA NA NA NA NA NA ...
##  $ SDF63B       : logi  NA NA NA NA NA NA ...
##  $ SDF63C       : logi  NA NA NA NA NA NA ...
##  $ SDF63D       : logi  NA NA NA NA NA NA ...
##  $ SDF64        : logi  NA NA NA NA NA NA ...
##  $ SDF65        : logi  NA NA NA NA NA NA ...
##  $ SDF65A       : logi  NA NA NA NA NA NA ...
##  $ SDF66        : logi  NA NA NA NA NA NA ...


Pull the regenerated DD

## load the data-dictionary sheet for this sub-dataset from the revised DD
## workbook (revisedDDpath)
dfDD <- read_excel(revisedDDpath, sheet = "AAAD_SOCIO_DEMO")


Handling Logical Variables

## extract all logical variables
logicols <- colnames(df)[vapply(df, is.logical, logical(1))] ## 121

## view those variables in the regenerated DD
dfDD[dfDD$VarNames %in% logicols, c("VarNames", "Data Type")]
## # A tibble: 121 × 2
##    VarNames    `Data Type`  
##    <chr>       <chr>        
##  1 REFCTR      VARCHAR2(6)  
##  2 REVIEW_DATE date         
##  3 REVIEWER    VARCHAR      
##  4 SDF12       VARCHAR2(200)
##  5 SDF13       NUMBER(3)    
##  6 SDF14       CHAR(2)      
##  7 SDF15       CHAR(2)      
##  8 SDF16       CHAR(2)      
##  9 SDF17       NUMBER(2)    
## 10 SDF17A      CHAR(2)      
## # ℹ 111 more rows
## The DD decides the target type of every logical column:
## NUMBER -> numeric, date -> Date, everything else -> character.

## select the vars to be converted to numeric
## (matched case-insensitively, like the DD lookup in the numeric check, so a
## lower-case "number" type is not silently turned into character)
convert2num <- dfDD$VarNames[dfDD$VarNames %in% logicols & grepl("number", dfDD$`Data Type`, ignore.case = TRUE)] ## 55 vars

## select the vars to be converted to date
convert2date <- dfDD$VarNames[dfDD$VarNames %in% logicols & grepl("date", dfDD$`Data Type`, ignore.case = TRUE)]
## [1] "REVIEW_DATE"

## the rest should be converted to character
convert2chr <- setdiff(logicols, c(convert2num, convert2date)) ## 65 vars

## convert
df[convert2num] <- lapply(df[convert2num], as.numeric)
df[convert2date] <- lapply(df[convert2date], as.Date)
df[convert2chr] <- lapply(df[convert2chr], as.character)

## recheck the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
## 
## [[2]]
## [1] "character"
## 
## [[3]]
## [1] "POSIXct" "POSIXt" 
## 
## [[4]]
## [1] "Date"


Handling Date Variables

## extract date-time variables (POSIXct/POSIXt) from sub-dataset
datecols <- colnames(df)[vapply(df, function(x) inherits(x, c("POSIXct", "POSIXt")), logical(1))]
## [1] "EXAM_DATE"     "DATE_OF_BIRTH"

## extract date variables from regenerated DD
## (case-insensitive and whitespace-tolerant, so "DATE", "date" and "Date"
## are all recognised, in line with the match used when converting logicals)
datecolsFromDD <- dfDD$VarNames[grepl("^[[:space:]]*date[[:space:]]*$", dfDD$`Data Type`, ignore.case = TRUE)]

## compare the two to see if we are missing any date variables
setdiff(datecols, datecolsFromDD) ## character(0)
## character(0)
setdiff(datecolsFromDD, datecols) ## REVIEW_DATE, can ignore, since it has been converted in last step
## [1] "REVIEW_DATE"
## drop = FALSE keeps a data frame even when only one date column is found
head(df[, datecols, drop = FALSE])
##    EXAM_DATE DATE_OF_BIRTH
## 1 2021-02-01    1943-09-22
## 2 2020-07-14    1946-10-04
## 3 2020-09-16    1950-10-02
## 4 2020-09-16    1949-04-30
## 5 2019-05-22    1937-10-24
## 6 2020-09-17    1935-10-25
## convert format: date-time -> Date
df[datecols] <- lapply(df[datecols], as.Date)

## recheck the unique data types
unique(sapply(df, class))
## [1] "numeric"   "character" "Date"


Handling Character Variables

## character columns actually present in the sub-dataset
chrcols <- names(Filter(is.character, df)) ## 89 vars

## DD rows whose declared type starts with VARCHAR/CHAR; used for both
## mismatch checks:
##   mismatchChrs_1: character in the data but some other type in the DD
##   mismatchChrs_2: character in the DD but some other type in the data
chrColsfromDD <- dfDD %>%
  dplyr::filter(grepl("^(varchar|char)", `Data Type`, ignore.case = TRUE)) %>%
  dplyr::select(VarNames, `Data Type`)

mismatchChrs_1 <- setdiff(chrcols, chrColsfromDD[["VarNames"]]) ## character(0)
mismatchChrs_2 <- setdiff(chrColsfromDD[["VarNames"]], chrcols) ## character(0)

## character variables that come with a value specification in the DD
tmp <- dfDD %>%
  dplyr::filter(
    grepl("CHAR|VARCHAR", `Data Type`, ignore.case = TRUE),
    !is.na(`Valid Responses`)
  ) %>%
  dplyr::select(VarNames, `Valid Responses`)

## compare the observed values of those columns against the DD specification
DT::datatable(check_valid_responses(tmp, df))
unique(df$SDF8A) ## DD: 8. Are you working now? Circle all that apply
## [1] "PT"     NA       "FT"     "V"      "FT  PT"
unique(df$SDF8B) ## DD: If participant says NO, ask Why not? If any of the following SKIP TO #20
##  [1] NA       "O"      "FT"     "R"      "PD"     "FT  PD" "R  O"   "U  R"  
##  [9] "S"      "U"      "IS"     "T"      "T  R"   "R  PD"
## NOTE: these two variables are fine, as they are marked "Multiple" in the [Single, Multiple or Calculated Values] column of the DD


Handling Numeric Variables

## numeric columns actually present in the sub-dataset
numcols <- names(Filter(is.numeric, df)) ## 69 vars

## DD rows declared as NUMBER; used for both mismatch checks:
##   mismatchNums_1: numeric in the data but some other type in the DD
##   mismatchNums_2: numeric in the DD but some other type in the data
numColsfromDD <- dfDD %>%
  dplyr::filter(grepl("number", `Data Type`, ignore.case = TRUE)) %>%
  dplyr::select(VarNames, `Valid Responses`)

mismatchNums_1 <- setdiff(numcols, numColsfromDD[["VarNames"]]) ## character(0)
mismatchNums_2 <- setdiff(numColsfromDD[["VarNames"]], numcols) ## character(0)

unique(numColsfromDD[["Valid Responses"]])
##  [1] NA                            "1 thru 99999;"              
##  [3] "1 thru 9999;"                "1;\r\n0;"                   
##  [5] "1 thru 145;"                 "1 thru 31;"                 
##  [7] "-2;"                         "0 thru 8;"                  
##  [9] "1;\r\n2;\r\n3;\r\n4;\r\n-2;" "0 thru 100;"                
## [11] "0 thru 5;"                   "0;\r\n1;\r\n-2;"            
## [13] "1;\r\n0;\r\n-2;"
## numeric DD variables that come with a value specification
tmp <- numColsfromDD[!is.na(numColsfromDD[["Valid Responses"]]), ]

DT::datatable(check_valid_numeric_responses(tmp, df))
## ignore GP


Save Cleaned Data

## overwrite AAAD_SOCIO_DEMO with the type-corrected working copy `df`
AAAD_SOCIO_DEMO <- df



AAAD_TRAILS

## work on a copy; AAAD_TRAILS itself is only overwritten in the
## "Save Cleaned Data" step
df <- AAAD_TRAILS

## info() comes from the sourced helper script; judging by its output it
## reports the number of observations, columns and distinct SYSIND values
info(AAAD_TRAILS,"SYSIND")
## #obs:439, cols:34, inds:428
## extract all the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
## 
## [[2]]
## [1] "character"
## 
## [[3]]
## [1] "logical"
## 
## [[4]]
## [1] "POSIXct" "POSIXt"
## full structure listing; the large list.len keeps str() from truncating
## the list of columns
str(df, max.level = 99, list.len = 99999)
Click for details
## 'data.frame':    439 obs. of  34 variables:
##  $ SYSXM        : num  7670123 7650923 7651273 7659813 7660113 ...
##  $ SYSIND       : num  11221133 11218963 11219583 11036793 11221813 ...
##  $ SYSGP        : num  7929223 7928203 7928153 7893833 7929683 ...
##  $ SYSGPSTUDY   : num  1367333 1366313 1366263 1309153 1367793 ...
##  $ SYSINDGP     : num  7984403 7982233 7982853 7792533 7985083 ...
##  $ CGI_ORDER    : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ GPS_ORDER    : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ STDCGI_ORDER : num  11 11 11 11 11 11 11 11 11 11 ...
##  $ LSTUDY       : chr  "ADCRLPRADI" "ADCONTROL" "ADFAMPRADI" "ADCRLPRADI" ...
##  $ DB_OWNER     : chr  "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" ...
##  $ STUDY        : chr  "ALZ" "ALZ" "ALZ" "ALZ" ...
##  $ SUBSTUDY     : chr  "ADCRLPRADI" "ADCONTROL" "ADFAMPRADI" "ADCRLPRADI" ...
##  $ CENTER       : chr  "IHG" "IHG" "IHG" "IHG" ...
##  $ GP           : num  88063 88095 88002 87595 88059 ...
##  $ IND          : num  1 1 100 9000 1 1 1 1 100 1 ...
##  $ REFCTR       : logi  NA NA NA NA NA NA ...
##  $ EXAM_DATE    : POSIXct, format: "2019-03-07" "2019-03-08" ...
##  $ EXAMINER     : chr  "bxf258" "sjt82" "v.rodriguez4" "v.rodriguez4" ...
##  $ DATE_OF_BIRTH: POSIXct, format: "1928-01-11" "1941-05-02" ...
##  $ AGE_AT_EXAM  : num  91 77 68 62 84 84 77 72 71 65 ...
##  $ REVIEW_DATE  : logi  NA NA NA NA NA NA ...
##  $ REVIEWER     : logi  NA NA NA NA NA NA ...
##  $ TIME_A       : num  NA 70 56 93 71 71 NA 30 275 60 ...
##  $ TIME_AMISS   : num  -1 NA NA NA NA NA NA NA NA NA ...
##  $ ERR_A        : num  0 0 2 1 0 0 1 0 1 0 ...
##  $ ERR_AMISS    : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ COR_A        : num  24 24 22 23 24 24 23 24 24 24 ...
##  $ COR_AMISS    : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ TIME_B       : num  NA 225 124 109 240 240 NA 71 NA 90 ...
##  $ TIME_BMISS   : num  -1 NA NA NA NA NA NA NA -2 NA ...
##  $ ERR_B        : num  NA 4 0 0 0 0 NA 0 NA 0 ...
##  $ ERR_BMISS    : num  -1 NA NA NA NA NA NA NA -2 NA ...
##  $ COR_B        : num  NA 20 24 24 24 24 NA 24 NA 24 ...
##  $ COR_BMISS    : num  -1 NA NA NA NA NA NA NA -2 NA ...


Pull the regenerated DD

## load the data-dictionary sheet for this sub-dataset from the revised DD
## workbook (revisedDDpath)
dfDD <- read_excel(revisedDDpath, sheet = "AAAD_TRAILS")


Handling Logical Variables

## extract all logical variables
logicols <- colnames(df)[vapply(df, is.logical, logical(1))] ## 3

## view those variables in the regenerated DD
dfDD[dfDD$VarNames %in% logicols, c("VarNames", "Data Type")]
## # A tibble: 3 × 2
##   VarNames    `Data Type`
##   <chr>       <chr>      
## 1 REFCTR      VARCHAR2(6)
## 2 REVIEW_DATE date       
## 3 REVIEWER    VARCHAR
## select the vars to be converted to numeric
## (none are listed for this sub-dataset, but the vector must be recomputed
## here: otherwise the setdiff() below reuses whatever convert2num was left
## over from the previously processed sub-dataset, or fails in a fresh session)
convert2num <- dfDD$VarNames[dfDD$VarNames %in% logicols & grepl("number", dfDD$`Data Type`, ignore.case = TRUE)]

## select the vars to be converted to date
convert2date <- dfDD$VarNames[dfDD$VarNames %in% logicols & grepl("date", dfDD$`Data Type`, ignore.case = TRUE)]
## [1] "REVIEW_DATE"

## the rest should be converted to character
convert2chr <- setdiff(logicols, c(convert2num, convert2date)) ## 2 vars

## convert (the numeric step is skipped when there is nothing to convert)
if (length(convert2num) > 0) {
  df[convert2num] <- lapply(df[convert2num], as.numeric)
}
df[convert2date] <- lapply(df[convert2date], as.Date)
df[convert2chr] <- lapply(df[convert2chr], as.character)

## recheck the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
## 
## [[2]]
## [1] "character"
## 
## [[3]]
## [1] "POSIXct" "POSIXt" 
## 
## [[4]]
## [1] "Date"


Handling Date Variables

## extract date-time variables (POSIXct/POSIXt) from sub-dataset
datecols <- colnames(df)[vapply(df, function(x) inherits(x, c("POSIXct", "POSIXt")), logical(1))]
## [1] "EXAM_DATE"     "DATE_OF_BIRTH"

## extract date variables from regenerated DD
## (case-insensitive and whitespace-tolerant, so "DATE", "date" and "Date"
## are all recognised, in line with the match used when converting logicals)
datecolsFromDD <- dfDD$VarNames[grepl("^[[:space:]]*date[[:space:]]*$", dfDD$`Data Type`, ignore.case = TRUE)]

## compare the two to see if we are missing any date variables
setdiff(datecols, datecolsFromDD) ## character(0)
## character(0)
setdiff(datecolsFromDD, datecols) ## REVIEW_DATE, can ignore, since it has been converted in previous step
## [1] "REVIEW_DATE"
## drop = FALSE keeps a data frame even when only one date column is found
head(df[, datecols, drop = FALSE])
##    EXAM_DATE DATE_OF_BIRTH
## 1 2019-03-07    1928-01-11
## 2 2019-03-08    1941-05-02
## 3 2019-03-04    1950-03-25
## 4 2019-03-08    1956-10-19
## 5 2019-03-07    1934-11-07
## 6 2019-03-07    1934-12-03
## convert format: date-time -> Date
df[datecols] <- lapply(df[datecols], as.Date)

## recheck the unique data types
unique(sapply(df, class))
## [1] "numeric"   "character" "Date"


Handling Character Variables

## character columns actually present in the sub-dataset
chrcols <- names(Filter(is.character, df)) ## 8 vars

## DD rows whose declared type starts with VARCHAR/CHAR; used for both
## mismatch checks:
##   mismatchChrs_1: character in the data but some other type in the DD
##   mismatchChrs_2: character in the DD but some other type in the data
chrColsfromDD <- dfDD %>%
  dplyr::filter(grepl("^(varchar|char)", `Data Type`, ignore.case = TRUE)) %>%
  dplyr::select(VarNames, `Data Type`)

mismatchChrs_1 <- setdiff(chrcols, chrColsfromDD[["VarNames"]]) ## character(0)
mismatchChrs_2 <- setdiff(chrColsfromDD[["VarNames"]], chrcols) ## character(0)

## character variables that come with a value specification in the DD
tmp <- dfDD %>%
  dplyr::filter(
    grepl("CHAR|VARCHAR", `Data Type`, ignore.case = TRUE),
    !is.na(`Valid Responses`)
  ) %>%
  dplyr::select(VarNames, `Valid Responses`)

## compare the observed values of those columns against the DD specification
DT::datatable(check_valid_responses(tmp, df))
## All values are within valid ranges.


Handling Numeric Variables

## numeric columns actually present in the sub-dataset
numcols <- names(Filter(is.numeric, df)) ## 23 vars

## DD rows declared as NUMBER; used for both mismatch checks:
##   mismatchNums_1: numeric in the data but some other type in the DD
##   mismatchNums_2: numeric in the DD but some other type in the data
numColsfromDD <- dfDD %>%
  dplyr::filter(grepl("number", `Data Type`, ignore.case = TRUE)) %>%
  dplyr::select(VarNames, `Valid Responses`)

mismatchNums_1 <- setdiff(numcols, numColsfromDD[["VarNames"]]) ## character(0)
mismatchNums_2 <- setdiff(numColsfromDD[["VarNames"]], numcols) ## character(0)

unique(numColsfromDD[["Valid Responses"]])
## [1] NA                  "1 thru 99999;"     "1 thru 9999;"     
## [4] "0 thru 150;"       "-1;\r\n-2;\r\n-3;" "0 thru 40;"       
## [7] "0 thru 24;"        "0 thru 300;"
## numeric DD variables that come with a value specification
tmp <- numColsfromDD[!is.na(numColsfromDD[["Valid Responses"]]), ]

DT::datatable(check_valid_numeric_responses(tmp, df))
## ignore GP and IND
## need to contact Mike about other variables: TIME_A, TIME_B, COR_B


Save Cleaned Data

## overwrite AAAD_TRAILS with the type-corrected working copy `df`
AAAD_TRAILS <- df



ALZ_B9_JUDGE_RC

## work on a copy; ALZ_B9_JUDGE_RC itself is only overwritten in the
## "Save Cleaned Data" step
df <- ALZ_B9_JUDGE_RC

## info() comes from the sourced helper script; judging by its output it
## reports the number of observations, columns and distinct SYSIND values
info(ALZ_B9_JUDGE_RC,"SYSIND")
## #obs:483, cols:82, inds:481
## extract all the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
## 
## [[2]]
## [1] "character"
## 
## [[3]]
## [1] "logical"
## 
## [[4]]
## [1] "POSIXct" "POSIXt"
## full structure listing; the large list.len keeps str() from truncating
## the list of columns
str(df, max.level = 99, list.len = 99999)
Click for details
## 'data.frame':    483 obs. of  82 variables:
##  $ SYSXM                  : num  8276003 8276013 8258753 8259063 8277553 ...
##  $ SYSIND                 : num  11620433 11160523 11034403 11369813 11620763 ...
##  $ SYSGP                  : num  8005513 7923793 7888823 7952013 8005723 ...
##  $ SYSGPSTUDY             : num  1452223 1361903 1304163 1397123 1452433 ...
##  $ SYSINDGP               : num  8389503 7923633 7790023 8139083 8389833 ...
##  $ CGI_ORDER              : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ GPS_ORDER              : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ STDCGI_ORDER           : num  11 11 11 11 11 11 11 11 11 11 ...
##  $ LSTUDY                 : chr  "ADCONTROL" "ADCRLPRADI" "ADFAMPRADI" "ADCRLPRADI" ...
##  $ DB_OWNER               : chr  "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" ...
##  $ STUDY                  : chr  "ALZ" "ALZ" "ALZ" "ALZ" ...
##  $ SUBSTUDY               : chr  "ADCONTROL" "ADCRLPRADI" "ADFAMPRADI" "ADCRLPRADI" ...
##  $ CENTER                 : chr  "IHG" "IHG" "IHG" "IHG" ...
##  $ GP                     : num  104507 87883 87556 88301 104457 ...
##  $ IND                    : num  1 1 9001 1 1 ...
##  $ REFCTR                 : logi  NA NA NA NA NA NA ...
##  $ EXAM_DATE              : POSIXct, format: "2023-08-09" "2024-02-14" ...
##  $ EXAMINER               : chr  "jjs2031" "gsv32" "jjs2031" "jjs2031" ...
##  $ DATE_OF_BIRTH          : POSIXct, format: "1944-06-21" "1939-03-20" ...
##  $ AGE_AT_EXAM            : num  79 84 68 76 76 81 86 73 86 66 ...
##  $ REVIEW_DATE            : logi  NA NA NA NA NA NA ...
##  $ REVIEWER               : logi  NA NA NA NA NA NA ...
##  $ MEMORY_DECLINE         : num  0 1 0 0 1 0 1 0 0 1 ...
##  $ COP_RPT_MEMDECLINE     : num  8 1 0 0 8 8 0 0 0 1 ...
##  $ MEANINGFUL_IMP         : num  0 1 0 0 1 0 1 0 0 1 ...
##  $ IMP_MEMORY             : num  NA 1 NA NA 1 NA 1 NA NA 1 ...
##  $ IMP_ORIENTATION        : num  NA 0 NA NA 1 NA 0 NA NA 1 ...
##  $ IMP_EXEC_FUNC          : num  NA 0 NA NA 1 NA 0 NA NA 1 ...
##  $ IMP_LANGUAGE           : num  NA 0 NA NA 0 NA 0 NA NA 0 ...
##  $ IMP_VISUOSPATIAL       : num  NA 0 NA NA 0 NA 0 NA NA 0 ...
##  $ IMP_ATTENTION          : num  NA 0 NA NA 0 NA 0 NA NA 1 ...
##  $ IMP_FLUCTUATING_COG    : num  NA 0 NA NA 0 NA 0 NA NA 0 ...
##  $ IMP_FLUCTUATING_AGE    : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ IMP_OTHER              : num  NA 0 NA NA 0 NA 0 NA NA 0 ...
##  $ IMP_OTH_SPECIFY        : chr  NA NA NA NA ...
##  $ IMP_PREDOMINANT_SYMP   : num  NA 1 NA NA 1 NA 1 NA NA 1 ...
##  $ IMP_PRED_SYMP_OTH      : chr  NA NA NA NA ...
##  $ IMP_MODE_ONSET         : num  NA 1 NA NA 1 NA 1 NA NA 1 ...
##  $ MODE_ONSET6A           : logi  NA NA NA NA NA NA ...
##  $ BEGIN_AGE              : num  NA 83 NA NA 76 NA 86 NA NA 63 ...
##  $ BEHAV_SYMPTOMS         : num  0 1 0 0 0 0 0 0 0 1 ...
##  $ BS_APATHY              : num  NA 0 NA NA NA NA NA NA NA 0 ...
##  $ BS_DEPRESSED           : num  NA 1 NA NA NA NA NA NA NA 1 ...
##  $ BS_VISUAL_HAL          : num  NA 1 NA NA NA NA NA NA NA 0 ...
##  $ HAL_WELL_INFORMED      : num  NA 1 NA NA NA NA NA NA NA NA ...
##  $ HAL_BEGIN_AGE          : num  NA 83 NA NA NA NA NA NA NA NA ...
##  $ AUDITORY_HAL           : num  NA 0 NA NA NA NA NA NA NA 0 ...
##  $ ABN_BELIEFS            : num  NA 0 NA NA NA NA NA NA NA 0 ...
##  $ BS_DISINIBITION        : num  NA 0 NA NA NA NA NA NA NA 0 ...
##  $ BS_IRRITABILITY        : num  NA 0 NA NA NA NA NA NA NA 0 ...
##  $ BS_AGITATION           : num  NA 0 NA NA NA NA NA NA NA 0 ...
##  $ BS_PERSONAL_CHG        : num  NA 0 NA NA NA NA NA NA NA 0 ...
##  $ BS_REM                 : num  NA 0 NA NA NA NA NA NA NA 0 ...
##  $ REM_BEGIN_AGE          : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ BS_ANXIETY             : num  NA 1 NA NA NA NA NA NA NA 0 ...
##  $ BS_OTHER               : num  NA 0 NA NA NA NA NA NA NA 0 ...
##  $ BS_OTHER_SPEC          : chr  NA NA NA NA ...
##  $ BS_PREDOMINANT_SYMP    : num  NA 2 NA NA NA NA NA NA NA 2 ...
##  $ BS_PRED_SYMP_OTH       : chr  NA NA NA NA ...
##  $ BS_MODE_ONSET          : num  NA 1 NA NA NA NA NA NA NA 2 ...
##  $ BS_MODE_ONSET_OTH      : chr  NA NA NA NA ...
##  $ BS_BEGIN_AGE           : num  NA 74 NA NA NA NA NA NA NA 63 ...
##  $ MOTOR_SYPTOMS          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ MS_GAIT1               : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ MS_FALLS1              : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ MS_TREMOR1             : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ MS_SLOWNESS1           : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ MS_PRED_SYMPTOM        : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ MS_MODE_ONSET          : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ MS_MODE_ONSET_OTH      : chr  NA NA NA NA ...
##  $ MS_PARKINSONISM        : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ PARK_BEGIN_AGE         : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ MS_ALS                 : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ MS_ALS_BEGIN_AGE       : logi  NA NA NA NA NA NA ...
##  $ MS_BEGIN_AGE           : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ OVERALL_COURSE_DEC     : num  8 1 8 8 1 8 8 8 8 1 ...
##  $ PRED_DOMAIN            : num  8 2 8 8 1 8 8 8 8 1 ...
##  $ LBD_CANDIDATE          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ FLD_CANDIDATE          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ NOTES_B9JUDGE          : chr  NA NA NA NA ...
##  $ TOTALSCORE_B9_Q9       : num  0 3 0 0 0 0 0 0 0 1 ...
##  $ TOTALSCORE_B9_Q9_STATUS: chr  NA NA NA NA ...


Pull the regenerated DD

## load the data-dictionary sheet for this sub-dataset from the revised DD
## workbook (revisedDDpath)
dfDD <- read_excel(revisedDDpath, sheet = "ALZ_B9_JUDGE_RC")


Handling Logical Variables

## extract all logical variables
logicols <- colnames(df)[vapply(df, is.logical, logical(1))]

## view those variables in the regenerated DD
dfDD[dfDD$VarNames %in% logicols, c("VarNames", "Data Type")]
## # A tibble: 5 × 2
##   VarNames         `Data Type`  
##   <chr>            <chr>        
## 1 REFCTR           VARCHAR2(6)  
## 2 REVIEW_DATE      date         
## 3 REVIEWER         VARCHAR      
## 4 MODE_ONSET6A     VARCHAR2(100)
## 5 MS_ALS_BEGIN_AGE NUMBER(3)
## The DD decides the target type of every logical column:
## NUMBER -> numeric, date -> Date, everything else -> character.

## select the vars to be converted to numeric
## (matched case-insensitively, like the DD lookup in the numeric check, so a
## lower-case "number" type is not silently turned into character)
convert2num <- dfDD$VarNames[dfDD$VarNames %in% logicols & grepl("number", dfDD$`Data Type`, ignore.case = TRUE)] ## MS_ALS_BEGIN_AGE

## select the vars to be converted to date
convert2date <- dfDD$VarNames[dfDD$VarNames %in% logicols & grepl("date", dfDD$`Data Type`, ignore.case = TRUE)] ## REVIEW_DATE

## the rest should be converted to character
convert2chr <- setdiff(logicols, c(convert2num, convert2date)) ## [1] "REFCTR"       "REVIEWER"     "MODE_ONSET6A"

## convert
df[convert2num] <- lapply(df[convert2num], as.numeric)
df[convert2date] <- lapply(df[convert2date], as.Date)
df[convert2chr] <- lapply(df[convert2chr], as.character)

## recheck the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
## 
## [[2]]
## [1] "character"
## 
## [[3]]
## [1] "POSIXct" "POSIXt" 
## 
## [[4]]
## [1] "Date"


Handling Date Variables

## extract date-time variables (POSIXct/POSIXt) from sub-dataset
datecols <- colnames(df)[vapply(df, function(x) inherits(x, c("POSIXct", "POSIXt")), logical(1))]
## [1] "EXAM_DATE"     "DATE_OF_BIRTH"

## extract date variables from regenerated DD
## (case-insensitive and whitespace-tolerant, so "DATE", "date" and "Date"
## are all recognised, in line with the match used when converting logicals)
datecolsFromDD <- dfDD$VarNames[grepl("^[[:space:]]*date[[:space:]]*$", dfDD$`Data Type`, ignore.case = TRUE)]

## compare the two to see if we are missing any date variables
setdiff(datecols, datecolsFromDD) ## character(0)
## character(0)
setdiff(datecolsFromDD, datecols) ## REVIEW_DATE, ignore it, as it has been corrected in previous step
## [1] "REVIEW_DATE"
## drop = FALSE keeps a data frame even when only one date column is found
head(df[, datecols, drop = FALSE])
##    EXAM_DATE DATE_OF_BIRTH
## 1 2023-08-09    1944-06-21
## 2 2024-02-14    1939-03-20
## 3 2023-06-22    1954-08-20
## 4 2024-02-13    1947-05-13
## 5 2023-04-17    1946-12-19
## 6 2024-02-15    1942-09-30
## convert format: date-time -> Date
df[datecols] <- lapply(df[datecols], as.Date)

## recheck the unique data types
unique(sapply(df, class))
## [1] "numeric"   "character" "Date"


Handling Character Variables

## character columns actually present in the sub-dataset
chrcols <- names(Filter(is.character, df)) ## 17 vars

## DD rows whose declared type starts with VARCHAR/CHAR; used for both
## mismatch checks:
##   mismatchChrs_1: character in the data but some other type in the DD
##   mismatchChrs_2: character in the DD but some other type in the data
chrColsfromDD <- dfDD %>%
  dplyr::filter(grepl("^(varchar|char)", `Data Type`, ignore.case = TRUE)) %>%
  dplyr::select(VarNames, `Data Type`)

mismatchChrs_1 <- setdiff(chrcols, chrColsfromDD[["VarNames"]]) ## character(0)

mismatchChrs_2 <- setdiff(chrColsfromDD[["VarNames"]], chrcols) ## character(0)

## character variables that come with a value specification in the DD
tmp <- dfDD %>%
  dplyr::filter(
    grepl("CHAR|VARCHAR", `Data Type`, ignore.case = TRUE),
    !is.na(`Valid Responses`)
  ) %>%
  dplyr::select(VarNames, `Valid Responses`)

## compare the observed values of those columns against the DD specification
DT::datatable(check_valid_responses(tmp, df))
## All values are within valid ranges.


Handling Numeric Variables

## collect the numeric columns of the sub-dataset
numcols <- names(Filter(is.numeric, df)) ## 62 vars

## extract numeric variables from DD

## check data type inconsistency:
## mismatchNums_1: present as numeric in data but others in the DD
## mismatchNums_2: present as numeric in DD but others in the data
numColsfromDD <- dfDD %>%
  dplyr::filter(grepl("number", `Data Type`, ignore.case = TRUE)) %>%
  dplyr::select(dplyr::all_of(c("VarNames", "Valid Responses")))

mismatchNums_1 <- setdiff(numcols, numColsfromDD$VarNames) ## character(0)
mismatchNums_2 <- setdiff(numColsfromDD$VarNames, numcols) ## character(0)

unique(numColsfromDD[["Valid Responses"]])
##  [1] NA                                                                
##  [2] "1 thru 99999;"                                                   
##  [3] "1 thru 9999;"                                                    
##  [4] "0;\r\n1;\r\n8;"                                                  
##  [5] "0;\r\n1;"                                                        
##  [6] "0;\r\n1;\r\n9;"                                                  
##  [7] "15 thru 110;"                                                    
##  [8] "1;\r\n2;\r\n3;\r\n4;\r\n5;\r\n6;\r\n7;\r\n8;\r\n99;"             
##  [9] "1;\r\n2;\r\n3;\r\n4;\r\n99;"                                     
## [10] "15 through 110;"                                                 
## [11] "15 through 110;\r\n888;"                                         
## [12] "1;\r\n2;\r\n3;\r\n4;\r\n5;\r\n6;\r\n7;\r\n8;\r\n9;\r\n10;\r\n99;"
## [13] "1;\r\n2;\r\n3;\r\n4;\r\n5;\r\n8;\r\n9;"                          
## [14] "1;\r\n2;\r\n3;\r\n8;\r\n9;"
## numeric variables whose valid values are specified in the DD
tmp <- dplyr::filter(numColsfromDD, !is.na(`Valid Responses`))

DT::datatable(check_valid_numeric_responses(tmp, df))
## ignore GP


Save Cleaned Data

## store the type-corrected working copy back under the sub-dataset's name
ALZ_B9_JUDGE_RC <- df



ALZ_CLINICALSUM

## df is the working copy that the cleaning steps below modify
df <- ALZ_CLINICALSUM

## summary counts from the info() helper
## (SYSIND presumably identifies individuals; see the helper script)
info(ALZ_CLINICALSUM, "SYSIND")
## #obs:1484, cols:39, inds:1480
## list the distinct column classes present in the data
df %>% sapply(class) %>% unique()
## [[1]]
## [1] "numeric"
## 
## [[2]]
## [1] "character"
## 
## [[3]]
## [1] "logical"
## 
## [[4]]
## [1] "POSIXct" "POSIXt"
## structure listing, with limits set high enough to show every column
str(df, list.len = 99999, max.level = 99)
Click for details
## 'data.frame':    1484 obs. of  39 variables:
##  $ SYSXM            : num  8063903 8066823 8067393 8065353 8058883 ...
##  $ SYSIND           : num  11493593 11493813 11493613 11493363 11493633 ...
##  $ SYSGP            : num  7946353 7946353 7946353 7946353 7946353 ...
##  $ SYSGPSTUDY       : num  1387463 1387463 1387463 1387463 1387463 ...
##  $ SYSINDGP         : num  8262663 8262883 8262683 8262433 8262703 ...
##  $ CGI_ORDER        : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ GPS_ORDER        : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ STDCGI_ORDER     : num  11 11 11 11 11 11 11 11 11 11 ...
##  $ LSTUDY           : chr  "ADFAMPRADI" "ADFAMPRADI" "ADFAMPRADI" "ADFAMPRADI" ...
##  $ DB_OWNER         : chr  "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" ...
##  $ STUDY            : chr  "ALZ" "ALZ" "ALZ" "ALZ" ...
##  $ SUBSTUDY         : chr  "ADFAMPRADI" "ADFAMPRADI" "ADFAMPRADI" "ADFAMPRADI" ...
##  $ CENTER           : chr  "IHG" "IHG" "IHG" "IHG" ...
##  $ GP               : num  87545 87545 87545 87545 87545 ...
##  $ IND              : num  9026 1024 144 124 148 ...
##  $ REFCTR           : logi  NA NA NA NA NA NA ...
##  $ DATE_OF_BIRTH    : POSIXct, format: "1973-01-14" "1941-04-03" ...
##  $ LAST_CONTACT_DATE: logi  NA NA NA NA NA NA ...
##  $ LAST_CONTACT_AGE : logi  NA NA NA NA NA NA ...
##  $ AGE_OF_DEATH     : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ AGE_OF_EXAM      : num  49 80 67 77 64 75 78 61 71 82 ...
##  $ IMPRESSION       : chr  "Affected By Exam" "Affected By Exam" "Affected By Exam" "Affected By Exam" ...
##  $ AD_CATEGORY      : chr  "No Data" "No Data" "No Data" "Definite AD (Exam)" ...
##  $ AGE_OF_ONSET     : num  43 79 65 57 61 73 71 60 65 75 ...
##  $ AOO_DOC_EST_UNK  : chr  "E" "E" "E" "E" ...
##  $ AGE_OF_DIAGNOSIS : num  NA 79 65 57 64 NA 73 NA 65 75 ...
##  $ AODX_UNKNOWN     : chr  "U" NA NA NA ...
##  $ AD_HX_CATEGORY   : chr  NA NA NA NA ...
##  $ UNCLEAR_CATEGORY : chr  NA NA NA NA ...
##  $ DEMENT_NAME      : chr  NA NA NA NA ...
##  $ CLINICAL_EXAMINER: chr  "katrina/DR. VANCE" "JOSE" "JOSE" "Jose Sanchez" ...
##  $ FOLLOW_UP        : chr  "N" "N" "N" "N" ...
##  $ AUTOPSY_DISCUSSED: chr  "ND" "N" "ND" "Y" ...
##  $ AUTOPSY_PLANNED  : chr  "ND" "N" "ND" "N" ...
##  $ VERIFY_DATE      : POSIXct, format: NA NA ...
##  $ VERIFY_USER      : chr  NA NA NA "Jose Javier Sanchez" ...
##  $ COMMENTS         : chr  NA NA NA NA ...
##  $ FORM_DATE        : POSIXct, format: "2022-03-29" "2022-03-30" ...
##  $ FILLED_OUT_BY    : chr  "kxc672" "jjs2031" "jjs2031" "jjs2031" ...


Pull the regenerated DD

## load the data-dictionary sheet that corresponds to this sub-dataset
dfDD <- readxl::read_excel(path = revisedDDpath, sheet = "ALZ_CLINICALSUM")


Handling Logical Variables

## extract all logical variables
## (presumably columns that were entirely empty in the export and so were read as logical; verify)
logicols <- colnames(df)[sapply(df, is.logical)]

## view those variables in the regenerated DD
dfDD[dfDD$VarNames %in% logicols, c("VarNames", "Data Type")]
## # A tibble: 3 × 2
##   VarNames          `Data Type`
##   <chr>             <chr>      
## 1 REFCTR            VARCHAR2(6)
## 2 LAST_CONTACT_DATE DATE       
## 3 LAST_CONTACT_AGE  NUMBER(2)
## the DD type is matched case-insensitively for both numbers and dates,
## in line with the numeric/character checks further down

## select the vars to be converted to numeric
convert2num <- dfDD$VarNames[dfDD$VarNames %in% logicols & grepl("number", dfDD$`Data Type`, ignore.case = TRUE)] ## LAST_CONTACT_AGE

## select the vars to be converted to date
convert2date <- dfDD$VarNames[dfDD$VarNames %in% logicols & grepl("date", dfDD$`Data Type`, ignore.case = TRUE)] ## LAST_CONTACT_DATE

## the rest should be converted to character
convert2chr <- setdiff(logicols, c(convert2num, convert2date)) ## REFCTR

## convert (each assignment is a no-op when its name vector is empty)
df[convert2num] <- lapply(df[convert2num], as.numeric)
df[convert2date] <- lapply(df[convert2date], as.Date)
df[convert2chr] <- lapply(df[convert2chr], as.character)

## recheck the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
## 
## [[2]]
## [1] "character"
## 
## [[3]]
## [1] "POSIXct" "POSIXt" 
## 
## [[4]]
## [1] "Date"


Handling Date Variables

## extract date-time variables from sub-dataset (columns stored as POSIXct/POSIXt)
datecols <- colnames(df)[sapply(df, function(x) inherits(x, c("POSIXct", "POSIXt")))]
## [1] "DATE_OF_BIRTH" "VERIFY_DATE"   "FORM_DATE" 

## extract date variables from regenerated DD
## (case-insensitive, whitespace-tolerant match so "DATE", "date" and "Date" all count)
datecolsFromDD <- dfDD$VarNames[toupper(trimws(dfDD$`Data Type`)) %in% "DATE"]

## compare the two to see if we are missing any date variables
setdiff(datecols, datecolsFromDD) ## character(0)
## character(0)
setdiff(datecolsFromDD, datecols) ## [1] "LAST_CONTACT_DATE" can ignore LAST_CONTACT_DATE, as it has been corrected in previous step
## [1] "LAST_CONTACT_DATE"
## drop = FALSE keeps a data frame even when there is only one date column
head(df[, datecols, drop = FALSE])
##   DATE_OF_BIRTH VERIFY_DATE  FORM_DATE
## 1    1973-01-14        <NA> 2022-03-29
## 2    1941-04-03        <NA> 2022-03-30
## 3    1955-02-08        <NA> 2022-03-28
## 4    1945-02-17  2022-05-18 2022-03-28
## 5    1958-03-11  2022-05-05 2022-03-29
## 6    1947-02-05  2022-05-05 2022-03-29
## convert the POSIXct date-times to plain Date
df[datecols] <- lapply(df[datecols], as.Date)

## recheck the unique data types
unique(sapply(df, class))
## [1] "numeric"   "character" "Date"


Handling Character Variables

## collect the character columns of the sub-dataset
chrcols <- names(Filter(is.character, df)) ## 20 vars

## check data type inconsistency:
## mismatchChrs_1: present as chr in data but others in the DD
## mismatchChrs_2: present as chr in DD but others in the data
chrColsfromDD <- dfDD %>%
  dplyr::filter(grepl("^(varchar|char)", `Data Type`, ignore.case = TRUE)) %>%
  dplyr::select(dplyr::all_of(c("VarNames", "Data Type")))

mismatchChrs_1 <- setdiff(chrcols, chrColsfromDD$VarNames) ## [1] "IMPRESSION"       "AD_CATEGORY"      "AD_HX_CATEGORY"   "UNCLEAR_CATEGORY"
## after checking the unique values of the variables in mismatchChrs_1, I believe they should all be characters,
## so I updated the DD for those variables (changed their data type in the DD and swapped the values of the
## "Valid Responses" and " Valid Responses Codes" columns)

mismatchChrs_2 <- setdiff(chrColsfromDD$VarNames, chrcols) ## character(0)

## character variables whose valid values are specified in the DD
tmp <- dfDD %>%
  dplyr::filter(
    grepl("CHAR|VARCHAR", `Data Type`, ignore.case = TRUE),
    !is.na(`Valid Responses`)
  ) %>%
  dplyr::select(dplyr::all_of(c("VarNames", "Valid Responses")))

## check whether the values observed in the chr columns match the DD
DT::datatable(check_valid_responses(tmp, df))
## the description of FILLED_OUT_BY says it is a dropdown for people to select from, so I believe multiple values are fine


Handling Numeric Variables

## collect the numeric columns of the sub-dataset
numcols <- names(Filter(is.numeric, df)) ## 15 vars

## extract numeric variables from DD

## check data type inconsistency:
## mismatchNums_1: present as numeric in data but others in the DD
## mismatchNums_2: present as numeric in DD but others in the data
numColsfromDD <- dfDD %>%
  dplyr::filter(grepl("number", `Data Type`, ignore.case = TRUE)) %>%
  dplyr::select(dplyr::all_of(c("VarNames", "Valid Responses")))

mismatchNums_1 <- setdiff(numcols, numColsfromDD$VarNames) ## character(0)
mismatchNums_2 <- setdiff(numColsfromDD$VarNames, numcols) ## character(0)

unique(numColsfromDD[["Valid Responses"]])
## [1] NA              "1 thru 99999;" "1 thru 9999;"
## numeric variables whose valid values are specified in the DD
tmp <- dplyr::filter(numColsfromDD, !is.na(`Valid Responses`))

DT::datatable(check_valid_numeric_responses(tmp, df))
## ignore GP


Save Cleaned Data

## store the type-corrected working copy back under the sub-dataset's name
ALZ_CLINICALSUM <- df



ALZ_CSDD

## df is the working copy that the cleaning steps below modify
df <- ALZ_CSDD

## summary counts from the info() helper
## (SYSIND presumably identifies individuals; see the helper script)
info(ALZ_CSDD, "SYSIND")
## #obs:181, cols:42, inds:176
## list the distinct column classes present in the data
df %>% sapply(class) %>% unique()
## [[1]]
## [1] "numeric"
## 
## [[2]]
## [1] "character"
## 
## [[3]]
## [1] "logical"
## 
## [[4]]
## [1] "POSIXct" "POSIXt"
## structure listing, with limits set high enough to show every column
str(df, list.len = 99999, max.level = 99)
Click for details
## 'data.frame':    181 obs. of  42 variables:
##  $ SYSXM           : num  7555573 7557803 7551403 7550933 7551073 ...
##  $ SYSIND          : num  11006333 11039713 11048273 11063923 11048283 ...
##  $ SYSGP           : num  7888683 7896183 7894423 7894423 7894423 ...
##  $ SYSGPSTUDY      : num  1304023 1311503 1309743 1309743 1309743 ...
##  $ SYSINDGP        : num  7761133 7795453 7804133 7822853 7804143 ...
##  $ CGI_ORDER       : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ GPS_ORDER       : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ STDCGI_ORDER    : num  11 11 11 11 11 11 11 11 11 11 ...
##  $ LSTUDY          : chr  "ADFAMPRADI" "ADFAMPRADI" "ADFAMPRADI" "ADFAMPRADI" ...
##  $ DB_OWNER        : chr  "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" ...
##  $ STUDY           : chr  "ALZ" "ALZ" "ALZ" "ALZ" ...
##  $ SUBSTUDY        : chr  "ADFAMPRADI" "ADFAMPRADI" "ADFAMPRADI" "ADFAMPRADI" ...
##  $ CENTER          : chr  "IHG" "IHG" "IHG" "IHG" ...
##  $ GP              : num  87535 87657 87650 87650 87650 ...
##  $ IND             : num  1001 1 105 110 106 ...
##  $ REFCTR          : logi  NA NA NA NA NA NA ...
##  $ EXAM_DATE       : POSIXct, format: "2018-04-13" "2018-04-17" ...
##  $ EXAMINER        : chr  "axr1589" "axr1589" "axr1589" "axr1589" ...
##  $ DATE_OF_BIRTH   : POSIXct, format: "1932-02-20" "1947-04-23" ...
##  $ AGE_AT_EXAM     : num  86 70 77 86 82 87 87 85 75 80 ...
##  $ ANXIETY         : num  2 0 0 0 0 2 -1 0 2 1 ...
##  $ SADNESS         : num  0 0 0 0 0 0 -1 2 0 1 ...
##  $ LACK_REACTION   : num  0 1 2 0 0 -1 -1 0 1 1 ...
##  $ IRRITABILITY    : num  1 0 0 2 0 0 -1 0 2 1 ...
##  $ AGITATION       : num  1 0 0 0 0 2 -1 0 1 0 ...
##  $ RETARDATION     : num  0 2 1 1 0 2 -1 0 0 1 ...
##  $ MULTI_COMPLAINTS: num  2 0 1 2 0 -1 -1 0 1 1 ...
##  $ LOSS_INTEREST   : num  1 0 2 0 0 -1 -1 0 2 0 ...
##  $ LOSS_APPETITE   : num  0 1 1 1 0 2 -1 1 1 1 ...
##  $ LOSS_WEIGHT     : num  0 0 2 2 1 2 -1 -1 1 0 ...
##  $ LACK_ENERGY     : num  2 0 2 1 1 2 -1 2 2 1 ...
##  $ DIURNAL_MOOD    : num  1 0 0 -1 0 2 -1 0 -1 0 ...
##  $ DIFF_ASLEEP     : num  0 0 0 0 0 2 -1 0 0 1 ...
##  $ MULTI_AWAKEN    : num  0 1 1 0 0 2 -1 0 0 0 ...
##  $ EARLY_AWAKEN    : num  0 0 1 0 1 0 -1 0 1 0 ...
##  $ SUICIDAL        : num  0 0 0 0 0 -1 -1 0 -1 0 ...
##  $ SELF_ESTEEM     : num  0 0 0 0 0 -1 -1 0 -1 0 ...
##  $ PESSIMISM       : num  0 0 0 0 0 -1 -1 0 -1 0 ...
##  $ MOOD_DELUSIONS  : num  0 0 0 0 0 -1 -1 0 -1 0 ...
##  $ NOTES_MEDS      : chr  NA NA NA NA ...
##  $ CSDD_SCORE      : num  10 5 13 9 3 18 0 5 14 9 ...
##  $ CSDD_COUNT      : num  19 19 19 18 19 12 0 18 14 19 ...


Pull the regenerated DD

## load the data-dictionary sheet that corresponds to this sub-dataset
dfDD <- readxl::read_excel(path = revisedDDpath, sheet = "ALZ_CSDD")


Handling Logical Variables

## extract all logical variables
logicols <- colnames(df)[sapply(df, is.logical)] ## 1

## view those variables in the regenerated DD
dfDD[dfDD$VarNames %in% logicols, c("VarNames", "Data Type")]
## # A tibble: 1 × 2
##   VarNames `Data Type`
##   <chr>    <chr>      
## 1 REFCTR   VARCHAR2(6)
## derive each logical column's target type from the DD rather than hard-coding
## names, so this step stays correct if the export or the DD changes
convert2num <- dfDD$VarNames[dfDD$VarNames %in% logicols & grepl("number", dfDD$`Data Type`, ignore.case = TRUE)] ## none here
convert2date <- dfDD$VarNames[dfDD$VarNames %in% logicols & grepl("date", dfDD$`Data Type`, ignore.case = TRUE)] ## none here

## the rest should be converted to character
convert2chr <- setdiff(logicols, c(convert2num, convert2date)) ## REFCTR

## convert (each assignment is a no-op when its name vector is empty)
df[convert2num] <- lapply(df[convert2num], as.numeric)
df[convert2date] <- lapply(df[convert2date], as.Date)
df[convert2chr] <- lapply(df[convert2chr], as.character)

## recheck the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
## 
## [[2]]
## [1] "character"
## 
## [[3]]
## [1] "POSIXct" "POSIXt"


Handling Date Variables

## extract date-time variables from sub-dataset (columns stored as POSIXct/POSIXt)
datecols <- colnames(df)[sapply(df, function(x) inherits(x, c("POSIXct", "POSIXt")))]
## [1] "EXAM_DATE"     "DATE_OF_BIRTH"

## extract date variables from regenerated DD
## (case-insensitive, whitespace-tolerant match so "DATE", "date" and "Date" all count)
datecolsFromDD <- dfDD$VarNames[toupper(trimws(dfDD$`Data Type`)) %in% "DATE"]

## compare the two to see if we are missing any date variables
setdiff(datecols, datecolsFromDD) ## character(0)
## character(0)
setdiff(datecolsFromDD, datecols) ## character(0)
## character(0)
## drop = FALSE keeps a data frame even when there is only one date column
head(df[, datecols, drop = FALSE])
##    EXAM_DATE DATE_OF_BIRTH
## 1 2018-04-13    1932-02-20
## 2 2018-04-17    1947-04-23
## 3 2018-03-15    1940-06-24
## 4 2018-04-03    1931-07-01
## 5 2018-04-03    1935-05-25
## 6 2018-04-24    1930-06-19
## convert the POSIXct date-times to plain Date
df[datecols] <- lapply(df[datecols], as.Date)

## recheck the unique data types
unique(sapply(df, class))
## [1] "numeric"   "character" "Date"


Handling Character Variables

## collect the character columns of the sub-dataset
chrcols <- names(Filter(is.character, df)) ## 8 vars

## check data type inconsistency:
## mismatchChrs_1: present as chr in data but others in the DD
## mismatchChrs_2: present as chr in DD but others in the data
chrColsfromDD <- dfDD %>%
  dplyr::filter(grepl("^(varchar|char)", `Data Type`, ignore.case = TRUE)) %>%
  dplyr::select(dplyr::all_of(c("VarNames", "Data Type")))

mismatchChrs_1 <- setdiff(chrcols, chrColsfromDD$VarNames) ## character(0)
mismatchChrs_2 <- setdiff(chrColsfromDD$VarNames, chrcols) ## character(0)

## character variables whose valid values are specified in the DD
tmp <- dfDD %>%
  dplyr::filter(
    grepl("CHAR|VARCHAR", `Data Type`, ignore.case = TRUE),
    !is.na(`Valid Responses`)
  ) %>%
  dplyr::select(dplyr::all_of(c("VarNames", "Valid Responses")))

## check whether the values observed in the chr columns match the DD
DT::datatable(check_valid_responses(tmp, df))
## All values are within valid ranges.


Handling Numeric Variables

## collect the numeric columns of the sub-dataset
numcols <- names(Filter(is.numeric, df)) ## 32 vars

## extract numeric variables from DD

## check data type inconsistency:
## mismatchNums_1: present as numeric in data but others in the DD
## mismatchNums_2: present as numeric in DD but others in the data
numColsfromDD <- dfDD %>%
  dplyr::filter(grepl("number", `Data Type`, ignore.case = TRUE)) %>%
  dplyr::select(dplyr::all_of(c("VarNames", "Valid Responses")))

mismatchNums_1 <- setdiff(numcols, numColsfromDD$VarNames) ## character(0)
mismatchNums_2 <- setdiff(numColsfromDD$VarNames, numcols) ## character(0)

unique(numColsfromDD[["Valid Responses"]])
## [1] NA                      "1 thru 99999;"         "1 thru 9999;"         
## [4] "-1;\r\n0;\r\n1;\r\n2;"
## numeric variables whose valid values are specified in the DD
tmp <- dplyr::filter(numColsfromDD, !is.na(`Valid Responses`))

DT::datatable(check_valid_numeric_responses(tmp, df))
## ignore GP


Save Cleaned Data

## store the type-corrected working copy back under the sub-dataset's name
ALZ_CSDD <- df



ALZ_EXAM

## df is the working copy that the cleaning steps below modify
df <- ALZ_EXAM

## summary counts from the info() helper
## (SYSIND presumably identifies individuals; see the helper script)
info(ALZ_EXAM, "SYSIND")
## #obs:526, cols:80, inds:522
## list the distinct column classes present in the data
df %>% sapply(class) %>% unique()
## [[1]]
## [1] "numeric"
## 
## [[2]]
## [1] "character"
## 
## [[3]]
## [1] "logical"
## 
## [[4]]
## [1] "POSIXct" "POSIXt"
## structure listing, with limits set high enough to show every column
str(df, list.len = 99999, max.level = 99)
Click for details
## 'data.frame':    526 obs. of  80 variables:
##  $ SYSXM               : num  7541263 7541363 7541493 7540523 7541543 ...
##  $ SYSIND              : num  11109753 11109763 11109783 11048913 11109793 ...
##  $ SYSGP               : num  7921103 7921113 7921133 7896183 7921143 ...
##  $ SYSGPSTUDY          : num  1359213 1359223 1359243 1311503 1359253 ...
##  $ SYSINDGP            : num  7869273 7869283 7869303 7804773 7869313 ...
##  $ CGI_ORDER           : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ GPS_ORDER           : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ STDCGI_ORDER        : num  11 11 11 11 11 11 11 11 11 11 ...
##  $ LSTUDY              : chr  "ADCRLPRADI" "ADCONTROL" "ADCRLPRADI" "ADFAMPRADI" ...
##  $ DB_OWNER            : chr  "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" ...
##  $ STUDY               : chr  "ALZ" "ALZ" "ALZ" "ALZ" ...
##  $ SUBSTUDY            : chr  "ADCRLPRADI" "ADCONTROL" "ADCRLPRADI" "ADFAMPRADI" ...
##  $ CENTER              : chr  "IHG" "IHG" "IHG" "IHG" ...
##  $ GP                  : num  87787 87788 87790 87657 87791 ...
##  $ IND                 : num  1 1 1 102 1 1 1 1 1 1 ...
##  $ REFCTR              : logi  NA NA NA NA NA NA ...
##  $ FORM_DATE           : POSIXct, format: "2018-03-06" "2018-03-06" ...
##  $ FILLED_OUT_BY       : chr  "v.rodriguez4" "bxf258" "v.rodriguez4" "v.rodriguez4" ...
##  $ DATE_OF_BIRTH       : POSIXct, format: "1950-06-30" "1956-12-21" ...
##  $ NEURO_METHOD        : chr  "E" "E" "E" "E" ...
##  $ NEURO_EXAM_DATE     : POSIXct, format: "2018-03-06" "2018-03-06" ...
##  $ NEURO_EXAMINER      : chr  "vanessa r" "briseida felicia" "vanessa" "Vanessa" ...
##  $ MOOD_AFFECT         : chr  "N" "N" "A" "N" ...
##  $ DEPRESSED           : chr  NA NA "Y" NA ...
##  $ MANIC               : chr  NA NA "N" NA ...
##  $ MOOD_OTHER          : chr  NA NA "N" NA ...
##  $ MOOD_OTHER_DSC      : chr  NA NA NA NA ...
##  $ SPEECH              : chr  "N" "N" "N" "N" ...
##  $ DYSARTHRIA          : chr  NA NA NA NA ...
##  $ DYSPHASIA           : chr  NA NA NA NA ...
##  $ SPEECH_OTHER        : chr  NA NA NA NA ...
##  $ SPEECH_OTHER_DSC    : chr  NA NA NA NA ...
##  $ FACIAL_EXPRESSION   : chr  "N" "N" "N" "N" ...
##  $ MASKED_FACE         : chr  NA NA NA NA ...
##  $ FACIAL_OTHER        : chr  NA NA NA NA ...
##  $ FACIAL_OTHER_DSC    : chr  NA NA NA NA ...
##  $ OCULAR_MOVEMENT     : chr  "N" "N" "N" "N" ...
##  $ IMPAIRED_UPGAZE     : chr  NA NA NA NA ...
##  $ OCULAR_OTHER        : chr  NA NA NA NA ...
##  $ OCULAR_OTHER_DSC    : chr  NA NA NA NA ...
##  $ BRADY               : chr  "N" "N" "N" "N" ...
##  $ BRADY_GLOBAL        : chr  NA NA NA NA ...
##  $ SLOWED_RAMS         : chr  NA NA NA NA ...
##  $ BRADY_OTHER         : chr  NA NA NA NA ...
##  $ BRADY_OTHER_DSC     : chr  NA NA NA NA ...
##  $ TREMOR              : chr  "Y" "N" "N" "N" ...
##  $ TREMOR_RESTING      : chr  "N" NA NA NA ...
##  $ TREMOR_ACTION       : chr  "Y" NA NA NA ...
##  $ GAIT                : chr  "N" "N" "N" "N" ...
##  $ DECR_ARM_SWING      : chr  NA NA NA NA ...
##  $ SHUFFLING           : chr  NA NA NA NA ...
##  $ MULTI_STEP          : chr  NA NA NA NA ...
##  $ GAIT_OTHER          : chr  NA NA NA NA ...
##  $ GAIT_OTHER_DSC      : chr  NA NA NA NA ...
##  $ POST_STABILITY      : chr  "N" "N" "N" "N" ...
##  $ MOTOR_TONE          : chr  "N" "N" "N" "N" ...
##  $ RIGIDITY            : chr  NA NA NA NA ...
##  $ COGWHEELING         : chr  NA NA NA NA ...
##  $ SPASTICITY          : chr  NA NA NA NA ...
##  $ FLACCIDITY          : chr  NA NA NA NA ...
##  $ MOTOR_ASYM          : chr  "ND" "ND" "ND" "ND" ...
##  $ REFLEXES_ASYM       : chr  "ND" "N" "ND" "ND" ...
##  $ REFLEXES_HYPERACTIVE: chr  "ND" "N" "ND" "ND" ...
##  $ REFLEXES_DECREASED  : chr  "ND" "N" "ND" "ND" ...
##  $ BABINSKI            : chr  "ND" "N" "ND" "ND" ...
##  $ CLIN_METHOD         : chr  "E" "E" "E" "E" ...
##  $ CLIN_EXAM_DATE      : POSIXct, format: "2018-03-06" "2018-03-06" ...
##  $ CLIN_EXAMINER       : chr  "vanessa r" "briseida" NA "Vanessa" ...
##  $ PROG_APHASIA        : chr  "N" "U" "N" "N" ...
##  $ AMNESIA             : chr  "N" "U" "N" "N" ...
##  $ LUNG_DX             : chr  "N" "U" "N" "N" ...
##  $ PREV_ARREST         : chr  "N" "U" "N" "N" ...
##  $ SUBSTANCE_ABUSE     : chr  "Y" "U" "N" "N" ...
##  $ SURGERY             : chr  "N" "U" "N" "N" ...
##  $ VAS_DEMENTIA        : chr  "N" "U" "N" "N" ...
##  $ PSY_DISORDER        : chr  "N" "U" "N" "N" ...
##  $ FLUCT_COGNITION     : chr  "N" "U" "N" "N" ...
##  $ DOPAMINE            : chr  "N" "U" "N" "N" ...
##  $ DOPA_CURRENT        : chr  "N" "U" "N" "N" ...
##  $ NEUROLEPTIC         : chr  "N" "U" "N" "U" ...


Pull the regenerated DD

## load the data-dictionary sheet that corresponds to this sub-dataset
dfDD <- readxl::read_excel(path = revisedDDpath, sheet = "ALZ_EXAM")


Handling Logical Variables

## extract all logical variables
logicols <- colnames(df)[sapply(df, is.logical)]

## view those variables in the regenerated DD
dfDD[dfDD$VarNames %in% logicols, c("VarNames", "Data Type")]
## # A tibble: 1 × 2
##   VarNames `Data Type`
##   <chr>    <chr>      
## 1 REFCTR   VARCHAR2(6)
## derive each logical column's target type from the DD rather than hard-coding
## names, so this step stays correct if the export or the DD changes
convert2num <- dfDD$VarNames[dfDD$VarNames %in% logicols & grepl("number", dfDD$`Data Type`, ignore.case = TRUE)] ## none here
convert2date <- dfDD$VarNames[dfDD$VarNames %in% logicols & grepl("date", dfDD$`Data Type`, ignore.case = TRUE)] ## none here

## the rest should be converted to character
convert2chr <- setdiff(logicols, c(convert2num, convert2date)) ## REFCTR

## convert (each assignment is a no-op when its name vector is empty)
df[convert2num] <- lapply(df[convert2num], as.numeric)
df[convert2date] <- lapply(df[convert2date], as.Date)
df[convert2chr] <- lapply(df[convert2chr], as.character)

## recheck the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
## 
## [[2]]
## [1] "character"
## 
## [[3]]
## [1] "POSIXct" "POSIXt"


Handling Date Variables

## extract date-time variables from sub-dataset (columns stored as POSIXct/POSIXt)
datecols <- colnames(df)[sapply(df, function(x) inherits(x, c("POSIXct", "POSIXt")))]
## [1] "FORM_DATE"       "DATE_OF_BIRTH"   "NEURO_EXAM_DATE" "CLIN_EXAM_DATE" 

## extract date variables from regenerated DD
## (case-insensitive, whitespace-tolerant match so "DATE", "date" and "Date" all count)
datecolsFromDD <- dfDD$VarNames[toupper(trimws(dfDD$`Data Type`)) %in% "DATE"]

## compare the two to see if we are missing any date variables
setdiff(datecols, datecolsFromDD) ## character(0)
## character(0)
setdiff(datecolsFromDD, datecols) ## character(0)
## character(0)
## drop = FALSE keeps a data frame even when there is only one date column
head(df[, datecols, drop = FALSE])
##    FORM_DATE DATE_OF_BIRTH NEURO_EXAM_DATE CLIN_EXAM_DATE
## 1 2018-03-06    1950-06-30      2018-03-06     2018-03-06
## 2 2018-03-06    1956-12-21      2018-03-06     2018-03-06
## 3 2018-03-06    1946-10-29      2018-03-06     2018-03-06
## 4 2018-02-19    1946-01-11      2018-02-19     2018-02-19
## 5 2018-03-06    1949-10-06      2018-03-06     2018-03-06
## 6 2018-03-05    1938-11-06      2018-03-05     2018-03-05
## convert the POSIXct date-times to plain Date
df[datecols] <- lapply(df[datecols], as.Date)

## recheck the unique data types
unique(sapply(df, class))
## [1] "numeric"   "character" "Date"


Handling Character Variables

## collect the character columns of the sub-dataset
chrcols <- names(Filter(is.character, df)) ## 66 vars

## check data type inconsistency:
## mismatchChrs_1: present as chr in data but others in the DD
## mismatchChrs_2: present as chr in DD but others in the data
chrColsfromDD <- dfDD %>%
  dplyr::filter(grepl("^(varchar|char)", `Data Type`, ignore.case = TRUE)) %>%
  dplyr::select(dplyr::all_of(c("VarNames", "Data Type")))

mismatchChrs_1 <- setdiff(chrcols, chrColsfromDD$VarNames) ## character(0)
mismatchChrs_2 <- setdiff(chrColsfromDD$VarNames, chrcols) ## character(0)

## character variables whose valid values are specified in the DD
tmp <- dfDD %>%
  dplyr::filter(
    grepl("CHAR|VARCHAR", `Data Type`, ignore.case = TRUE),
    !is.na(`Valid Responses`)
  ) %>%
  dplyr::select(dplyr::all_of(c("VarNames", "Valid Responses")))

## check whether the values observed in the chr columns match the DD
DT::datatable(check_valid_responses(tmp, df))
## ignore NEURO_METHOD and FILLED_OUT_BY


Handling Numeric Variables

## collect the numeric columns of the sub-dataset
numcols <- names(Filter(is.numeric, df)) ## 10 vars

## extract numeric variables from DD

## check data type inconsistency:
## mismatchNums_1: present as numeric in data but others in the DD
## mismatchNums_2: present as numeric in DD but others in the data
numColsfromDD <- dfDD %>%
  dplyr::filter(grepl("number", `Data Type`, ignore.case = TRUE)) %>%
  dplyr::select(dplyr::all_of(c("VarNames", "Valid Responses")))

mismatchNums_1 <- setdiff(numcols, numColsfromDD$VarNames) ## character(0)
mismatchNums_2 <- setdiff(numColsfromDD$VarNames, numcols) ## character(0)

unique(numColsfromDD[["Valid Responses"]])
## [1] NA              "1 thru 99999;" "1 thru 9999;"
## numeric variables whose valid values are specified in the DD
tmp <- dplyr::filter(numColsfromDD, !is.na(`Valid Responses`))

DT::datatable(check_valid_numeric_responses(tmp, df))
## ignore GP


Save Cleaned Data

## store the type-corrected working copy back under the sub-dataset's name
ALZ_EXAM <- df



ALZ_GAI_SP

## df is the working copy that the cleaning steps below modify
df <- ALZ_GAI_SP

## summary counts from the info() helper
## (SYSIND presumably identifies individuals; see the helper script)
info(ALZ_GAI_SP, "SYSIND")
## #obs:19, cols:42, inds:19
## list the distinct column classes present in the data
df %>% sapply(class) %>% unique()
## [[1]]
## [1] "numeric"
## 
## [[2]]
## [1] "character"
## 
## [[3]]
## [1] "logical"
## 
## [[4]]
## [1] "POSIXct" "POSIXt"
## structure listing, with limits set high enough to show every column
str(df, list.len = 99999, max.level = 99)
Click for details
## 'data.frame':    19 obs. of  42 variables:
##  $ SYSXM               : num  8095923 8103313 8089823 8065953 8066073 ...
##  $ SYSIND              : num  11008753 11147113 11008763 11358523 11369753 ...
##  $ SYSGP               : num  7888993 7922413 7888993 7945143 7951963 ...
##  $ SYSGPSTUDY          : num  1304333 1360523 1304333 1386053 1397073 ...
##  $ SYSINDGP            : num  7763553 7910223 7763563 8127793 8139023 ...
##  $ CGI_ORDER           : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ GPS_ORDER           : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ STDCGI_ORDER        : num  11 11 11 11 11 11 11 11 11 11 ...
##  $ LSTUDY              : chr  "ADFAMPRADI" "ADFAMPRADI" "ADFAMPRADI" "ADCRLPRADI" ...
##  $ DB_OWNER            : chr  "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" ...
##  $ STUDY               : chr  "ALZ" "ALZ" "ALZ" "ALZ" ...
##  $ SUBSTUDY            : chr  "ADFAMPRADI" "ADFAMPRADI" "ADFAMPRADI" "ADCRLPRADI" ...
##  $ CENTER              : chr  "IHG" "IHG" "IHG" "IHG" ...
##  $ GP                  : num  87577 87858 87577 88247 88316 ...
##  $ IND                 : num  9000 103 9001 1 1 ...
##  $ REFCTR              : logi  NA NA NA NA NA NA ...
##  $ EXAM_DATE           : POSIXct, format: "2022-07-12" "2022-07-14" ...
##  $ EXAMINER            : chr  "mxc2207" "jjs2031" "jjs2031" "mxc2207" ...
##  $ DATE_OF_BIRTH       : POSIXct, format: "1944-10-12" "1939-04-08" ...
##  $ AGE_AT_EXAM         : num  77 83 61 75 85 81 71 80 52 62 ...
##  $ REVIEW_DATE         : logi  NA NA NA NA NA NA ...
##  $ REVIEWER            : logi  NA NA NA NA NA NA ...
##  $ WORRY_ALOT          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ DIFF_MAKE_DECISION  : num  1 0 0 0 0 0 0 1 0 1 ...
##  $ FEEL_JUMPY          : num  1 0 0 0 0 0 0 1 0 0 ...
##  $ HARD_TO_RELAX       : num  0 0 0 0 0 0 0 1 0 0 ...
##  $ CANNOT_ENJOY        : num  1 0 0 0 0 0 0 0 0 0 ...
##  $ THINGS_BOTHER_ME    : num  1 0 0 0 0 0 0 0 0 1 ...
##  $ BUTTERFLIES         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ WORRIER             : num  0 0 0 1 1 1 0 1 1 1 ...
##  $ RIVIAL_THINGS       : num  0 0 0 1 0 0 0 1 1 1 ...
##  $ OFTEN_NERVOUS       : num  1 1 0 0 1 0 0 0 1 0 ...
##  $ THOUGHTS_ANXIOUS    : num  1 0 0 0 0 0 0 0 0 0 ...
##  $ UPSET_STOMACH       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ THINK_MYSELF_NERVOUS: num  0 0 0 1 0 0 0 1 0 0 ...
##  $ ANTICIPATE_WORST    : num  1 0 0 0 0 0 0 1 0 0 ...
##  $ FEEL_SHAKY          : num  1 0 0 0 0 0 0 1 0 0 ...
##  $ INTERFERE_WITH_LIFE : num  1 0 0 0 0 0 0 0 0 0 ...
##  $ OVERWHELM           : num  1 0 0 0 0 0 0 0 0 0 ...
##  $ FEEL_GREAT_KNOT     : num  1 0 0 0 0 0 0 0 0 0 ...
##  $ MISS_OUT            : num  1 0 0 0 1 0 0 0 0 0 ...
##  $ FEEL_UPSET          : num  1 0 0 0 0 0 0 0 0 0 ...


Pull the regenerated DD

## load the data-dictionary sheet that corresponds to this sub-dataset
dfDD <- readxl::read_excel(path = revisedDDpath, sheet = "ALZ_GAI_SP")


Handling Logical Variables

## extract all logical variables
logicols <- colnames(df)[sapply(df, is.logical)]

## view those variables in the regenerated DD
dfDD[dfDD$VarNames %in% logicols, c("VarNames", "Data Type")]
## # A tibble: 3 × 2
##   VarNames    `Data Type`
##   <chr>       <chr>      
## 1 REFCTR      VARCHAR2(6)
## 2 REVIEW_DATE date       
## 3 REVIEWER    VARCHAR
## derive each logical column's target type from the DD rather than hard-coding
## names, so this step stays correct if the export or the DD changes
convert2num <- dfDD$VarNames[dfDD$VarNames %in% logicols & grepl("number", dfDD$`Data Type`, ignore.case = TRUE)] ## none here
convert2date <- dfDD$VarNames[dfDD$VarNames %in% logicols & grepl("date", dfDD$`Data Type`, ignore.case = TRUE)] ## REVIEW_DATE

## the rest should be converted to character
convert2chr <- setdiff(logicols, c(convert2num, convert2date)) ## REFCTR, REVIEWER

## convert (each assignment is a no-op when its name vector is empty)
df[convert2num] <- lapply(df[convert2num], as.numeric)
df[convert2chr] <- lapply(df[convert2chr], as.character)
df[convert2date] <- lapply(df[convert2date], as.Date)

## recheck the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
## 
## [[2]]
## [1] "character"
## 
## [[3]]
## [1] "POSIXct" "POSIXt" 
## 
## [[4]]
## [1] "Date"


Handling Date Variables

## extract date-time variables from sub-dataset (columns stored as POSIXct/POSIXt)
datecols <- colnames(df)[sapply(df, function(x) inherits(x, c("POSIXct", "POSIXt")))]
## [1] "EXAM_DATE"     "DATE_OF_BIRTH" 

## extract date variables from regenerated DD
## (case-insensitive, whitespace-tolerant match so "DATE", "date" and "Date" all count)
datecolsFromDD <- dfDD$VarNames[toupper(trimws(dfDD$`Data Type`)) %in% "DATE"]

## compare the two to see if we are missing any date variables
setdiff(datecols, datecolsFromDD) ## character(0)
## character(0)
setdiff(datecolsFromDD, datecols) ## REVIEW_DATE, ignore it, since it has been converted in previous step
## [1] "REVIEW_DATE"
## drop = FALSE keeps a data frame even when there is only one date column
head(df[, datecols, drop = FALSE])
##    EXAM_DATE DATE_OF_BIRTH
## 1 2022-07-12    1944-10-12
## 2 2022-07-14    1939-04-08
## 3 2022-07-12    1961-06-12
## 4 2022-04-01    1946-10-06
## 5 2022-03-31    1936-12-21
## 6 2022-03-30    1940-06-12
## convert the POSIXct date-times to plain Date
df[datecols] <- lapply(df[datecols], as.Date)

## recheck the unique data types
unique(sapply(df, class))
## [1] "numeric"   "character" "Date"


Handling Character Variables

## collect the character columns of the sub-dataset
chrcols <- names(Filter(is.character, df))
## [1] "LSTUDY"   "DB_OWNER" "STUDY"    "SUBSTUDY" "CENTER"   "REFCTR"   "EXAMINER" "REVIEWER"

## check data type inconsistency:
## mismatchChrs_1: present as chr in data but others in the DD
## mismatchChrs_2: present as chr in DD but others in the data
chrColsfromDD <- dfDD %>%
  dplyr::filter(grepl("^(varchar|char)", `Data Type`, ignore.case = TRUE)) %>%
  dplyr::select(dplyr::all_of(c("VarNames", "Data Type")))

mismatchChrs_1 <- setdiff(chrcols, chrColsfromDD$VarNames) ## character(0)
mismatchChrs_2 <- setdiff(chrColsfromDD$VarNames, chrcols) ## character(0)

## character variables whose valid values are specified in the DD
tmp <- dfDD %>%
  dplyr::filter(
    grepl("CHAR|VARCHAR", `Data Type`, ignore.case = TRUE),
    !is.na(`Valid Responses`)
  ) %>%
  dplyr::select(dplyr::all_of(c("VarNames", "Valid Responses")))

## check whether the values observed in the chr columns match the DD
DT::datatable(check_valid_responses(tmp, df))
## ignore EXAMINER, as I assume we can have multiple examiners


Handling Numeric Variables

## collect the numeric columns of the sub-dataset
numcols <- names(Filter(is.numeric, df)) ## 31 vars

## extract numeric variables from DD

## check data type inconsistency:
## mismatchNums_1: present as numeric in data but others in the DD
## mismatchNums_2: present as numeric in DD but others in the data
numColsfromDD <- dfDD %>%
  dplyr::filter(grepl("number", `Data Type`, ignore.case = TRUE)) %>%
  dplyr::select(dplyr::all_of(c("VarNames", "Valid Responses")))

mismatchNums_1 <- setdiff(numcols, numColsfromDD$VarNames) ## character(0)
mismatchNums_2 <- setdiff(numColsfromDD$VarNames, numcols) ## character(0)

unique(numColsfromDD[["Valid Responses"]])
## [1] NA              "1 thru 99999;" "1 thru 9999;"  "1;\r\n0;\r\n"
## numeric variables whose valid values are specified in the DD
tmp <- dplyr::filter(numColsfromDD, !is.na(`Valid Responses`))

DT::datatable(check_valid_numeric_responses(tmp, df))
## All numeric values are within valid ranges.


Save Cleaned Data

## store the type-corrected working copy back under the sub-dataset's name
ALZ_GAI_SP <- df



ALZ_LOAD_COG

## df is the working copy that the cleaning steps below modify
df <- ALZ_LOAD_COG

## summary counts from the info() helper
## (SYSIND presumably identifies individuals; see the helper script)
info(ALZ_LOAD_COG, "SYSIND")
## #obs:1006, cols:41, inds:907
## list the distinct column classes present in the data
df %>% sapply(class) %>% unique()
## [[1]]
## [1] "numeric"
## 
## [[2]]
## [1] "character"
## 
## [[3]]
## [1] "logical"
## 
## [[4]]
## [1] "POSIXct" "POSIXt"
## structure listing, with limits set high enough to show every column
str(df, list.len = 99999, max.level = 99)
Click for details
## 'data.frame':    1006 obs. of  41 variables:
##  $ SYSXM         : num  7540463 7540813 7540903 7540593 7541233 ...
##  $ SYSIND        : num  11006263 11059623 11059693 11048913 11109753 ...
##  $ SYSGP         : num  7888673 7897223 7897223 7896183 7921103 ...
##  $ SYSGPSTUDY    : num  1304013 1312543 1312543 1311503 1359213 ...
##  $ SYSINDGP      : num  7761063 7818553 7818623 7804773 7869273 ...
##  $ CGI_ORDER     : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ GPS_ORDER     : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ STDCGI_ORDER  : num  11 11 11 11 11 11 11 11 11 11 ...
##  $ LSTUDY        : chr  "ADCRLPRADI" "ADFAMPRADI" "ADFAMPRADI" "ADFAMPRADI" ...
##  $ DB_OWNER      : chr  "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" ...
##  $ STUDY         : chr  "ALZ" "ALZ" "ALZ" "ALZ" ...
##  $ SUBSTUDY      : chr  "ADCRLPRADI" "ADFAMPRADI" "ADFAMPRADI" "ADFAMPRADI" ...
##  $ CENTER        : chr  "IHG" "IHG" "IHG" "IHG" ...
##  $ GP            : num  87534 87699 87699 87657 87787 ...
##  $ IND           : num  104 101 108 102 1 1 1 1 1 1 ...
##  $ REFCTR        : logi  NA NA NA NA NA NA ...
##  $ INTERVIEW_DATE: POSIXct, format: "2018-02-21" "2018-02-18" ...
##  $ INTERVIEWER   : chr  "v.rodriguez4" "axr1589" "axr1589" "v.rodriguez4" ...
##  $ DATE_OF_BIRTH : POSIXct, format: "1936-09-20" "1929-10-08" ...
##  $ INTERVIEW_AGE : num  81 88 68 72 67 61 68 68 79 65 ...
##  $ VERSION       : chr  "2.0" "2.0" "2" "2.0" ...
##  $ PHONE         : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ STORY         : num  6 3 4 6 18 19 9 1 9 12 ...
##  $ DIGFOR        : num  9 5 8 10 9 12 8 8 3 7 ...
##  $ DIGBAK        : num  6 5 7 4 7 6 4 0 2 5 ...
##  $ ANIMALS       : num  16 11 20 14 15 17 7 13 NA 13 ...
##  $ FRUITS        : logi  NA NA NA NA NA NA ...
##  $ VEG           : num  13 6 12 5 14 7 5 7 NA 6 ...
##  $ DIGORD        : num  7 2 4 5 8 8 7 0 NA 7 ...
##  $ DELAY         : num  8 0 6 3 12 17 7 0 NA 5 ...
##  $ HOWWELL       : num  NA NA 1 NA 1 1 1 9 1 1 ...
##  $ HEARIMP       : num  NA NA 2 NA 2 2 2 2 2 2 ...
##  $ STATUS        : num  1 1 1 1 1 1 1 1 4 1 ...
##  $ COMM          : chr  NA NA NA NA ...
##  $ ANIMALS_REP   : logi  NA NA NA NA NA NA ...
##  $ ANIMALS_INT   : logi  NA NA NA NA NA NA ...
##  $ VEG_REP       : logi  NA NA NA NA NA NA ...
##  $ VEG_INT       : logi  NA NA NA NA NA NA ...
##  $ DIGFOR_LEN    : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ DIGBAK_LEN    : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ DELAY_LEN     : num  NA NA NA NA NA NA NA NA NA NA ...


Pull the regenerated DD

## load this table's sheet from the revised data dictionary workbook
## (revisedDDpath); dfDD is overwritten for every dataset section
dfDD <- read_excel(revisedDDpath, sheet = "ALZ_LOAD_COG")


Handling Logical Variables

## columns imported as logical (they appear to be empty in the str() listing),
## so their intended type has to be taken from the data dictionary
logicols <- colnames(df)[vapply(df, is.logical, logical(1))]

## view those variables in the regenerated DD
dfDD[dfDD$VarNames %in% logicols, c("VarNames", "Data Type")]
## # A tibble: 6 × 2
##   VarNames    `Data Type`
##   <chr>       <chr>
## 1 REFCTR      VARCHAR2(6)
## 2 FRUITS      NUMBER
## 3 ANIMALS_REP <NA>
## 4 ANIMALS_INT <NA>
## 5 VEG_REP     <NA>
## 6 VEG_INT     <NA>
## REFCTR is VARCHAR2 in the DD -> character; FRUITS is NUMBER -> numeric
convert2chr <- c("REFCTR")
convert2num <- c("FRUITS")

## the remaining logical columns have no data type in the DD, so they are
## left as logical for now

## convert each group from its OWN columns; using df[convert2chr] as the
## source of the numeric conversion would overwrite FRUITS with REFCTR's values
df[convert2chr] <- lapply(df[convert2chr], as.character)
df[convert2num] <- lapply(df[convert2num], as.numeric)

## recheck the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
## 
## [[2]]
## [1] "character"
## 
## [[3]]
## [1] "POSIXct" "POSIXt" 
## 
## [[4]]
## [1] "logical"


Handling Date Variables

## extract date variables from sub-dataset (read_excel imports them as POSIXct)
datecols <- colnames(df)[vapply(
  df,
  function(x) inherits(x, c("POSIXct", "POSIXt")),
  logical(1)
)]
## [1] "INTERVIEW_DATE" "DATE_OF_BIRTH"

## extract date variables from regenerated DD
datecolsFromDD <- dfDD$VarNames[dfDD$`Data Type` %in% c("DATE", "date")]

## compare the two to see if we are missing any date variables
setdiff(datecols, datecolsFromDD) ## character(0)
## character(0)
setdiff(datecolsFromDD, datecols) ## character(0)
## character(0)
## preview; drop = FALSE keeps a data frame even with a single date column
head(df[, datecols, drop = FALSE])
##   INTERVIEW_DATE DATE_OF_BIRTH
## 1     2018-02-21    1936-09-20
## 2     2018-02-18    1929-10-08
## 3     2018-02-19    1949-08-01
## 4     2018-02-19    1946-01-11
## 5     2018-03-06    1950-06-30
## 6     2018-03-06    1956-12-21
## convert format: POSIXct date-times -> Date
df[datecols] <- lapply(df[datecols], as.Date)

## recheck the unique data types
unique(sapply(df, class))
## [1] "numeric"   "character" "Date"      "logical"


Handling Character Variables

## extract character variables from sub-dataset; vapply() keeps the mask a
## logical vector for any input (sapply() is not type-stable)
chrcols <- colnames(df)[vapply(df, is.character, logical(1))] ## 9 vars

## check data type inconsistency:
## mismatchChrs_1: character in the data but not CHAR/VARCHAR in the DD
## mismatchChrs_2: CHAR/VARCHAR in the DD but not character in the data
chrColsfromDD <- dfDD[
  grepl("^(varchar|char)", dfDD$`Data Type`, ignore.case = TRUE),
  c("VarNames", "Data Type")
]

mismatchChrs_1 <- setdiff(chrcols, chrColsfromDD$VarNames) ## character(0)
mismatchChrs_2 <- setdiff(chrColsfromDD$VarNames, chrcols) ## character(0)

## extract character variables with value specification
tmp <- dfDD[
  grepl("CHAR|VARCHAR", dfDD$`Data Type`, ignore.case = TRUE) &
    !is.na(dfDD$`Valid Responses`),
  c("VarNames", "Valid Responses")
]

## check whether the unique values of the chr columns match the DD
DT::datatable(check_valid_responses(tmp, df))
## ignore INTERVIEWER


Handling Numeric Variables

## numeric columns in the working copy; vapply() keeps the mask a logical
## vector for any input (sapply() is not type-stable)
numcols <- colnames(df)[vapply(df, is.numeric, logical(1))] ## 26 vars

## numeric (NUMBER*) variables according to the DD
numColsfromDD <- dfDD[
  grepl("number", dfDD$`Data Type`, ignore.case = TRUE),
  c("VarNames", "Valid Responses")
]

## check data type inconsistency:
## mismatchNums_1: numeric in the data but not NUMBER in the DD
## mismatchNums_2: NUMBER in the DD but not numeric in the data
mismatchNums_1 <- setdiff(numcols, numColsfromDD$VarNames) ## character(0)
mismatchNums_2 <- setdiff(numColsfromDD$VarNames, numcols) ## character(0)

## distinct value specifications used by the numeric variables
unique(numColsfromDD$`Valid Responses`)
##  [1] NA
##  [2] "1 thru 99999;"
##  [3] "1 thru 9999;"
##  [4] "1;\r\n2;\r\n8;\r\n9;"
##  [5] "0 thru 25;\r\n96;\r\n97;\r\n98;\r\n99;"
##  [6] "0 thru 12;\r\n96;\r\n97;\r\n98;\r\n99;"
##  [7] "0 thru 75;\r\n96;\r\n97;\r\n98;\r\n99;"
##  [8] "0 thru 16;\r\n96;\r\n97;\r\n98;\r\n99;"
##  [9] "1;\r\n2;\r\n3;\r\n4;\r\n5;\r\n8;\r\n9;"
## [10] "1;\r\n2;"
## [11] "1;\r\n2;\r\n3;\r\n4;\r\n10;\r\n11;\r\n12;\r\n13;\r\n14;\r\n20;\r\n21;"
## numeric variables that carry a value specification
tmp <- dfDD[
  grepl("number", dfDD$`Data Type`, ignore.case = TRUE) &
    !is.na(dfDD$`Valid Responses`),
  c("VarNames", "Valid Responses")
]

## compare the observed values against the DD specification
DT::datatable(check_valid_numeric_responses(tmp, df))
## ignore GP


Save Cleaned Data

## store the type-corrected working copy back under the dataset's own name
## (in-memory assignment only; nothing is written to disk at this point)
ALZ_LOAD_COG <- df



ALZ_NCRAD

## work on a copy named df; it is assigned back to ALZ_NCRAD at the end of
## this section
df <- ALZ_NCRAD

## info() is a project helper (presumably from the sourced helper script);
## called with the ID column name, it prints the counts shown below
info(ALZ_NCRAD,"SYSIND")
## #obs:743, cols:53, inds:742
## extract all the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
## 
## [[2]]
## [1] "character"
## 
## [[3]]
## [1] "logical"
## 
## [[4]]
## [1] "POSIXct" "POSIXt"
## full structure listing; the large max.level / list.len values keep str()
## from truncating the column list
str(df, max.level = 99, list.len = 99999)
Click for details
## 'data.frame':    743 obs. of  53 variables:
##  $ SYSXM        : num  7895163 7879963 7879983 7880163 7880193 ...
##  $ SYSIND       : num  11218613 11041143 11041043 11039473 11005233 ...
##  $ SYSGP        : num  7928123 7894373 7894373 7896023 7888553 ...
##  $ SYSGPSTUDY   : num  1366233 1309693 1309693 1311343 1303893 ...
##  $ SYSINDGP     : num  7981883 7797003 7796903 7795213 7760033 ...
##  $ CGI_ORDER    : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ GPS_ORDER    : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ STDCGI_ORDER : num  11 11 11 11 11 11 11 11 11 11 ...
##  $ LSTUDY       : chr  "ADCRLPRADI" "ADFAMPRADI" "ADFAMPRADI" "ADCRLPRADI" ...
##  $ DB_OWNER     : chr  "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" ...
##  $ STUDY        : chr  "ALZ" "ALZ" "ALZ" "ALZ" ...
##  $ SUBSTUDY     : chr  "ADCRLPRADI" "ADFAMPRADI" "ADFAMPRADI" "ADCRLPRADI" ...
##  $ CENTER       : chr  "IHG" "IHG" "IHG" "IHG" ...
##  $ GP           : num  87998 87502 87502 87506 87501 ...
##  $ IND          : num  1 102 100 1 1 1 1 1 1 1 ...
##  $ REFCTR       : logi  NA NA NA NA NA NA ...
##  $ QUALIFY      : chr  "Unknown" "Yes" "Yes" "Yes" ...
##  $ FORM_DATE    : POSIXct, format: "2021-02-01" "2018-04-13" ...
##  $ FILLED_OUT_BY: chr  "sjt82" "v.rodriguez4" "v.rodriguez4" "medical records" ...
##  $ DATE_OF_BIRTH: POSIXct, format: "1943-09-22" "1950-10-02" ...
##  $ IN_NCRAD     : chr  NA NA NA NA ...
##  $ SAMPLED      : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ EDUC         : num  14 12 16 9 12 1 5 16 3 15 ...
##  $ VISIT        : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ COMREQ       : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ NOTDEMCI     : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ EVALMETH     : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ EVALYR       : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ CLDEMLEW     : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ COMDXAD      : logi  NA NA NA NA NA NA ...
##  $ NONADDEM     : logi  NA NA NA NA NA NA ...
##  $ COMDXNAD     : logi  NA NA NA NA NA NA ...
##  $ AAOSYMP      : num  NA NA NA NA 1 1 NA NA NA NA ...
##  $ STROKETY     : logi  NA NA NA NA NA NA ...
##  $ STROKEAGE    : logi  NA NA NA NA NA NA ...
##  $ HYPERAGE     : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ HEARTAGE     : logi  NA NA NA NA NA NA ...
##  $ DIABETX      : logi  NA NA NA NA NA NA ...
##  $ DIABETAG     : logi  NA NA NA NA NA NA ...
##  $ PDCLINDX     : logi  NA NA NA NA NA NA ...
##  $ PDAGE        : logi  NA NA NA NA NA NA ...
##  $ DEPRTX       : logi  NA NA NA NA NA NA ...
##  $ DEPRAGE      : logi  NA NA NA NA NA NA ...
##  $ HEADAGE      : logi  NA NA NA NA NA NA ...
##  $ ABUSEAGE     : logi  NA NA NA NA NA NA ...
##  $ COM28_36     : logi  NA NA NA NA NA NA ...
##  $ COM_ANY      : chr  NA NA NA NA ...
##  $ CONTROL      : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ CONTYPE      : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ RELDEM       : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ GENRSCH      : num  2 2 2 2 2 2 2 2 2 2 ...
##  $ UNCON_VAL    : logi  NA NA NA NA NA NA ...
##  $ UNCON_UNIT   : logi  NA NA NA NA NA NA ...


Pull the regenerated DD

## load this table's sheet from the revised data dictionary workbook
## (revisedDDpath); dfDD is overwritten for every dataset section
dfDD <- read_excel(revisedDDpath, sheet = "ALZ_NCRAD")


Handling Logical Variables

## columns imported as logical (they appear to be empty in the str() listing),
## so their intended type has to be taken from the data dictionary
logicols <- colnames(df)[vapply(df, is.logical, logical(1))]

## view those variables in the regenerated DD
dfDD[dfDD$VarNames %in% logicols, c("VarNames", "Data Type")]
## # A tibble: 18 × 2
##    VarNames   `Data Type`
##    <chr>      <chr>
##  1 REFCTR     VARCHAR2(6)
##  2 COMDXAD    VARCHAR2(255)
##  3 NONADDEM   NUMBER(2)
##  4 COMDXNAD   VARCHAR2(255)
##  5 STROKETY   NUMBER(1)
##  6 STROKEAGE  NUMBER(3)
##  7 HEARTAGE   NUMBER(3)
##  8 DIABETX    NUMBER(1)
##  9 DIABETAG   NUMBER(3)
## 10 PDCLINDX   NUMBER(1)
## 11 PDAGE      NUMBER(3)
## 12 DEPRTX     NUMBER(1)
## 13 DEPRAGE    NUMBER(3)
## 14 HEADAGE    NUMBER(3)
## 15 ABUSEAGE   NUMBER(3)
## 16 COM28_36   VARCHAR2(255)
## 17 UNCON_VAL  NUMBER(3)
## 18 UNCON_UNIT VARCHAR2(7)
## split the logical columns by the type the DD declares for them
is_logicol <- dfDD$VarNames %in% logicols
convert2num <- dfDD$VarNames[
  is_logicol & grepl("NUMBER", dfDD$`Data Type`, ignore.case = TRUE)
] ## 13 vars
convert2chr <- dfDD$VarNames[
  is_logicol & grepl("CHAR", dfDD$`Data Type`, ignore.case = TRUE)
] ## 5 vars

## convert each group from its OWN columns; using df[convert2chr] as the
## source of the numeric conversion would overwrite the 13 NUMBER columns
## with (recycled) copies of the 5 CHAR columns
df[convert2num] <- lapply(df[convert2num], as.numeric)
df[convert2chr] <- lapply(df[convert2chr], as.character)

## recheck the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
## 
## [[2]]
## [1] "character"
## 
## [[3]]
## [1] "POSIXct" "POSIXt"


Handling Date Variables

## extract date variables from sub-dataset (read_excel imports them as POSIXct)
datecols <- colnames(df)[vapply(
  df,
  function(x) inherits(x, c("POSIXct", "POSIXt")),
  logical(1)
)]
## [1] "FORM_DATE"     "DATE_OF_BIRTH"

## extract date variables from regenerated DD
datecolsFromDD <- dfDD$VarNames[dfDD$`Data Type` %in% c("DATE", "date")]

## compare the two to see if we are missing any date variables
setdiff(datecols, datecolsFromDD) ## character(0)
## character(0)
setdiff(datecolsFromDD, datecols) ## character(0)
## character(0)
## preview; drop = FALSE keeps a data frame even with a single date column
head(df[, datecols, drop = FALSE])
##    FORM_DATE DATE_OF_BIRTH
## 1 2021-02-01    1943-09-22
## 2 2018-04-13    1950-10-02
## 3 2020-04-23    1949-04-30
## 4 2016-11-17    1933-03-03
## 5 2019-05-22    1937-10-24
## 6 2020-09-17    1935-10-25
## convert format: POSIXct date-times -> Date
df[datecols] <- lapply(df[datecols], as.Date)

## recheck the unique data types
unique(sapply(df, class))
## [1] "numeric"   "character" "Date"


Handling Character Variables

## extract character variables from sub-dataset; vapply() keeps the mask a
## logical vector for any input (sapply() is not type-stable)
chrcols <- colnames(df)[vapply(df, is.character, logical(1))] ## 14 vars

## check data type inconsistency:
## mismatchChrs_1: character in the data but not CHAR/VARCHAR in the DD
## mismatchChrs_2: CHAR/VARCHAR in the DD but not character in the data
chrColsfromDD <- dfDD[
  grepl("^(varchar|char)", dfDD$`Data Type`, ignore.case = TRUE),
  c("VarNames", "Data Type")
]

mismatchChrs_1 <- setdiff(chrcols, chrColsfromDD$VarNames) ## character(0)
mismatchChrs_2 <- setdiff(chrColsfromDD$VarNames, chrcols) ## character(0)

## extract character variables with value specification
tmp <- dfDD[
  grepl("CHAR|VARCHAR", dfDD$`Data Type`, ignore.case = TRUE) &
    !is.na(dfDD$`Valid Responses`),
  c("VarNames", "Valid Responses")
]

## check whether the unique values of the chr columns match the DD
DT::datatable(check_valid_responses(tmp, df))
## ignore FILLED_OUT_BY


Handling Numeric Variables

## numeric columns in the working copy; vapply() keeps the mask a logical
## vector for any input (sapply() is not type-stable)
numcols <- colnames(df)[vapply(df, is.numeric, logical(1))] ## 37 vars

## numeric (NUMBER*) variables according to the DD
numColsfromDD <- dfDD[
  grepl("number", dfDD$`Data Type`, ignore.case = TRUE),
  c("VarNames", "Valid Responses")
]

## check data type inconsistency:
## mismatchNums_1: numeric in the data but not NUMBER in the DD
## mismatchNums_2: NUMBER in the DD but not numeric in the data
mismatchNums_1 <- setdiff(numcols, numColsfromDD$VarNames) ## character(0)
mismatchNums_2 <- setdiff(numColsfromDD$VarNames, numcols) ## character(0)

## distinct value specifications used by the numeric variables
unique(numColsfromDD$`Valid Responses`)
##  [1] NA
##  [2] "1 thru 99999;"
##  [3] "1 thru 9999;"
##  [4] "1;\r\n2;"
##  [5] "0 thru 50;\r\n99;"
##  [6] "1 thru 50;\r\n98;"
##  [7] "1;\r\n2;\r\n3;\r\n9;"
##  [8] "1;\r\n2;\r\n3;\r\n4;\r\n5;\r\n6;\r\n9;"
##  [9] "1;\r\n2;\r\n3;\r\n4;\r\n5;\r\n6;\r\n7;\r\n9;"
## [10] "1930 thru 2020;\r\n9999;"
## [11] "1;\r\n2;\r\n9;"
## [12] "1;\r\n2;\r\n3;\r\n4;\r\n5;\r\n6;\r\n7;\r\n8;\r\n9;\r\n10;\r\n11;\r\n12;\r\n13;\r\n14;\r\n15;\r\n16;\r\n17;\r\n99;"
## [13] "1;\r\n2;\r\n3;\r\n4;\r\n5;\r\n6;\r\n7;\r\n8;\r\n9;"
## [14] "1 thru 80;\r\n999;"
## [15] "1;\r\n2;\r\n3;\r\n4;\r\n5;\r\n9;"
## [16] "1;\r\n2;\r\n3;\r\n4;\r\n9;\r\n"
## [17] "1;\r\n2; \r\n3;"
## numeric variables that carry a value specification
tmp <- dfDD[
  grepl("number", dfDD$`Data Type`, ignore.case = TRUE) &
    !is.na(dfDD$`Valid Responses`),
  c("VarNames", "Valid Responses")
]

## compare the observed values against the DD specification
DT::datatable(check_valid_numeric_responses(tmp, df))
## ignore GP


Save Cleaned Data

## store the type-corrected working copy back under the dataset's own name
## (in-memory assignment only; nothing is written to disk at this point)
ALZ_NCRAD <- df



ALZ_NEURO_CDR

## work on a copy named df; it is assigned back to ALZ_NEURO_CDR at the end of
## this section
df <- ALZ_NEURO_CDR

## info() is a project helper (presumably from the sourced helper script);
## called with the ID column name, it prints the counts shown below
info(ALZ_NEURO_CDR,"SYSIND")
## #obs:1221, cols:30, inds:1102
## extract all the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
## 
## [[2]]
## [1] "character"
## 
## [[3]]
## [1] "logical"
## 
## [[4]]
## [1] "POSIXct" "POSIXt"
## full structure listing; the large max.level / list.len values keep str()
## from truncating the column list
str(df, max.level = 99, list.len = 99999)
Click for details
## 'data.frame':    1221 obs. of  30 variables:
##  $ SYSXM        : num  7540623 7540773 7546423 7546433 7546863 ...
##  $ SYSIND       : num  11048883 11059623 11044293 11011053 11046873 ...
##  $ SYSGP        : num  7896183 7897223 7894093 7889553 7894313 ...
##  $ SYSGPSTUDY   : num  1311503 1312543 1309413 1304893 1309633 ...
##  $ SYSINDGP     : num  7804743 7818553 7800153 7766073 7802733 ...
##  $ CGI_ORDER    : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ GPS_ORDER    : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ STDCGI_ORDER : num  11 11 11 11 11 11 11 11 11 11 ...
##  $ LSTUDY       : chr  "ADFAMPRADI" "ADFAMPRADI" "ADFAMPRADI" "ADFAMPRADI" ...
##  $ DB_OWNER     : chr  "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" ...
##  $ STUDY        : chr  "ALZ" "ALZ" "ALZ" "ALZ" ...
##  $ SUBSTUDY     : chr  "ADFAMPRADI" "ADFAMPRADI" "ADFAMPRADI" "ADFAMPRADI" ...
##  $ CENTER       : chr  "IHG" "IHG" "IHG" "IHG" ...
##  $ GP           : num  87657 87699 87604 87580 87620 ...
##  $ IND          : num  1000 101 104 9010 101 106 110 1 102 1 ...
##  $ REFCTR       : logi  NA NA NA NA NA NA ...
##  $ EXAM_DATE    : POSIXct, format: "2017-02-19" "2018-02-18" ...
##  $ EXAMINER     : chr  "axr1589" "axr1589" "avg55" "v.rodriguez4" ...
##  $ DATE_OF_BIRTH: POSIXct, format: "1923-04-17" "1929-10-08" ...
##  $ AGE_AT_EXAM  : num  93 88 80 75 76 56 86 84 86 91 ...
##  $ METHOD       : chr  "IP" "IP" "TE" "TE" ...
##  $ RECONSTRUCTED: chr  "U" "U" "N" "N" ...
##  $ CDR_TOTAL    : num  2 2 3 2 1 1 0.5 3 3 2 ...
##  $ MEMORY       : num  2 2 3 2 0.5 2 0.5 3 3 2 ...
##  $ ORIENTATION  : num  2 2 3 1 1 1 0.5 3 3 1 ...
##  $ PROBLEM_SOLVE: num  2 1 3 3 1 2 0 3 3 3 ...
##  $ COM_AFFAIR   : num  2 1 3 2 0.5 1 0 3 3 1 ...
##  $ HOME_HOBBIES : num  2 3 3 3 3 1 0.5 3 3 2 ...
##  $ PERSONAL_CARE: num  3 2 3 2 2 1 0 3 3 2 ...
##  $ CDR_COMM     : chr  NA NA "Too impaired to complete patient portion." "spoke with daughter about her mother, she is not able to keep a conversation. Barely functions with in the hous"| __truncated__ ...


Pull the regenerated DD

## load this table's sheet from the revised data dictionary workbook
## (revisedDDpath); dfDD is overwritten for every dataset section
dfDD <- read_excel(revisedDDpath, sheet = "ALZ_NEURO_CDR")


Handling Logical Variables

## columns imported as logical (they appear to be empty in the str() listing),
## so their intended type has to be taken from the data dictionary
logicols <- colnames(df)[vapply(df, is.logical, logical(1))]

## view those variables in the regenerated DD
dfDD[dfDD$VarNames %in% logicols, c("VarNames", "Data Type")]
## # A tibble: 1 × 2
##   VarNames `Data Type`
##   <chr>    <chr>
## 1 REFCTR   VARCHAR2(6)
## logical columns the DD declares as CHAR/VARCHAR are converted to character
## (TRUE spelled out: T is an ordinary variable and can be reassigned)
convert2chr <- dfDD$VarNames[
  dfDD$VarNames %in% logicols &
    grepl("CHAR", dfDD$`Data Type`, ignore.case = TRUE)
]

## convert
df[convert2chr] <- lapply(df[convert2chr], as.character)

## recheck the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
## 
## [[2]]
## [1] "character"
## 
## [[3]]
## [1] "POSIXct" "POSIXt"


Handling Date Variables

## extract date variables from sub-dataset (read_excel imports them as POSIXct)
datecols <- colnames(df)[vapply(
  df,
  function(x) inherits(x, c("POSIXct", "POSIXt")),
  logical(1)
)]
## [1] "EXAM_DATE"     "DATE_OF_BIRTH"

## extract date variables from regenerated DD
datecolsFromDD <- dfDD$VarNames[dfDD$`Data Type` %in% c("DATE", "date")]

## compare the two to see if we are missing any date variables
setdiff(datecols, datecolsFromDD) ## character(0)
## character(0)
setdiff(datecolsFromDD, datecols) ## character(0)
## character(0)
## preview; drop = FALSE keeps a data frame even with a single date column
head(df[, datecols, drop = FALSE])
##    EXAM_DATE DATE_OF_BIRTH
## 1 2017-02-19    1923-04-17
## 2 2018-02-18    1929-10-08
## 3 2018-03-16    1937-04-09
## 4 2018-03-20    1942-07-16
## 5 2018-03-06    1942-02-05
## 6 2018-04-03    1961-10-19
## convert format: POSIXct date-times -> Date
df[datecols] <- lapply(df[datecols], as.Date)

## recheck the unique data types
unique(sapply(df, class))
## [1] "numeric"   "character" "Date"


Handling Character Variables

## extract character variables from sub-dataset; vapply() keeps the mask a
## logical vector for any input (sapply() is not type-stable)
chrcols <- colnames(df)[vapply(df, is.character, logical(1))] ## 10 vars

## check data type inconsistency:
## mismatchChrs_1: character in the data but not CHAR/VARCHAR in the DD
## mismatchChrs_2: CHAR/VARCHAR in the DD but not character in the data
chrColsfromDD <- dfDD[
  grepl("^(varchar|char)", dfDD$`Data Type`, ignore.case = TRUE),
  c("VarNames", "Data Type")
]

mismatchChrs_1 <- setdiff(chrcols, chrColsfromDD$VarNames) ## character(0)
mismatchChrs_2 <- setdiff(chrColsfromDD$VarNames, chrcols) ## character(0)

## extract character variables with value specification
tmp <- dfDD[
  grepl("CHAR|VARCHAR", dfDD$`Data Type`, ignore.case = TRUE) &
    !is.na(dfDD$`Valid Responses`),
  c("VarNames", "Valid Responses")
]

## check whether the unique values of the chr columns match the DD
DT::datatable(check_valid_responses(tmp, df))
## ignore EXAMINER
## NOTE(review): this note originally said FILLED_OUT_BY, which is not a column
## of ALZ_NEURO_CDR; EXAMINER is the matching free-text ID column -- confirm


Handling Numeric Variables

## numeric columns in the working copy; vapply() keeps the mask a logical
## vector for any input (sapply() is not type-stable)
numcols <- colnames(df)[vapply(df, is.numeric, logical(1))] ## 18 vars

## numeric (NUMBER*) variables according to the DD
numColsfromDD <- dfDD[
  grepl("number", dfDD$`Data Type`, ignore.case = TRUE),
  c("VarNames", "Valid Responses")
]

## check data type inconsistency:
## mismatchNums_1: numeric in the data but not NUMBER in the DD
## mismatchNums_2: NUMBER in the DD but not numeric in the data
mismatchNums_1 <- setdiff(numcols, numColsfromDD$VarNames) ## character(0)
mismatchNums_2 <- setdiff(numColsfromDD$VarNames, numcols) ## character(0)

## distinct value specifications used by the numeric variables
unique(numColsfromDD$`Valid Responses`)
## [1] NA                     "1 thru 99999;"        "1 thru 9999;"
## [4] "0.0 thru 3.4;\r\n-1;"
## numeric variables that carry a value specification
tmp <- dfDD[
  grepl("number", dfDD$`Data Type`, ignore.case = TRUE) &
    !is.na(dfDD$`Valid Responses`),
  c("VarNames", "Valid Responses")
]

## compare the observed values against the DD specification
DT::datatable(check_valid_numeric_responses(tmp, df))
## ignore GP


Save Cleaned Data

## store the type-corrected working copy back under the dataset's own name
## (in-memory assignment only; nothing is written to disk at this point)
ALZ_NEURO_CDR <- df



ALZ_NPIQ_CBRS

## work on a copy named df; it is assigned back to ALZ_NPIQ_CBRS at the end of
## this section
df <- ALZ_NPIQ_CBRS

## info() is a project helper (presumably from the sourced helper script);
## called with the ID column name, it prints the counts shown below
info(ALZ_NPIQ_CBRS,"SYSIND")
## #obs:123, cols:116, inds:121
## extract all the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
## 
## [[2]]
## [1] "character"
## 
## [[3]]
## [1] "logical"
## 
## [[4]]
## [1] "POSIXct" "POSIXt"
## full structure listing; the large max.level / list.len values keep str()
## from truncating the column list
str(df, max.level = 99, list.len = 99999)
Click for details
## 'data.frame':    123 obs. of  116 variables:
##  $ SYSXM         : num  7545813 7557843 7550923 7551043 7558333 ...
##  $ SYSIND        : num  11039643 11039713 11063923 11048283 11039953 ...
##  $ SYSGP         : num  7896143 7896183 7894423 7894423 7896303 ...
##  $ SYSGPSTUDY    : num  1311463 1311503 1309743 1309743 1311623 ...
##  $ SYSINDGP      : num  7795383 7795453 7822853 7804143 7795693 ...
##  $ CGI_ORDER     : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ GPS_ORDER     : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ STDCGI_ORDER  : num  11 11 11 11 11 11 11 11 11 11 ...
##  $ LSTUDY        : chr  "ADFAMPRADI" "ADFAMPRADI" "ADFAMPRADI" "ADFAMPRADI" ...
##  $ DB_OWNER      : chr  "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" ...
##  $ STUDY         : chr  "ALZ" "ALZ" "ALZ" "ALZ" ...
##  $ SUBSTUDY      : chr  "ADFAMPRADI" "ADFAMPRADI" "ADFAMPRADI" "ADFAMPRADI" ...
##  $ CENTER        : chr  "IHG" "IHG" "IHG" "IHG" ...
##  $ GP            : num  87654 87657 87650 87650 87663 ...
##  $ IND           : num  1 1 110 106 1 ...
##  $ REFCTR        : logi  NA NA NA NA NA NA ...
##  $ EXAM_DATE     : POSIXct, format: "2018-03-19" "2018-04-17" ...
##  $ EXAMINER      : chr  "avg55" "axr1589" "axr1589" "axr1589" ...
##  $ DATE_OF_BIRTH : POSIXct, format: "1933-06-05" "1947-04-23" ...
##  $ AGE_AT_EXAM   : num  84 70 86 82 87 85 77 75 80 74 ...
##  $ NPIQINF       : chr  "2" "1" "1" "1" ...
##  $ NPIQINF_PRO   : chr  NA NA NA NA ...
##  $ NPIQINF_OTH   : chr  NA NA NA NA ...
##  $ NPIQINFA      : num  1 1 1 1 1 1 1 1 1 0 ...
##  $ NPIQINFB      : num  3 3 3 3 3 3 3 3 3 3 ...
##  $ NPIQTYPE      : num  2 1 1 1 1 2 1 1 1 1 ...
##  $ AGIT          : num  1 0 0 0 0 0 0 1 0 1 ...
##  $ AGITSEV       : num  1 NA NA NA NA NA NA 2 NA 1 ...
##  $ AGITATION_DIST: num  NA NA NA NA NA NA NA 5 NA 1 ...
##  $ DEPD          : num  1 0 0 0 0 1 0 0 1 0 ...
##  $ DEPDSEV       : num  1 NA NA NA NA 3 NA NA 2 NA ...
##  $ DEPRESS_DIST  : num  NA NA NA NA NA 5 NA NA 2 NA ...
##  $ ANX           : num  0 0 0 0 0 0 0 0 0 1 ...
##  $ ANXSEV        : num  NA NA NA NA NA NA NA NA NA 2 ...
##  $ ANXIETY_DIST  : num  NA NA NA NA NA NA NA NA NA 4 ...
##  $ ELAT          : num  0 0 0 0 0 0 0 1 0 0 ...
##  $ ELATSEV       : num  NA NA NA NA NA NA NA 1 NA NA ...
##  $ ELATION_DIST  : num  NA NA NA NA NA NA NA 0 NA NA ...
##  $ APA           : num  0 0 0 0 1 0 0 0 1 0 ...
##  $ APASEV        : num  NA NA NA NA 3 NA NA NA 2 NA ...
##  $ APATHY_DIST   : num  NA NA NA NA 0 NA NA NA 2 NA ...
##  $ DISN          : num  0 0 0 0 0 0 0 1 0 1 ...
##  $ DISNSEV       : num  NA NA NA NA NA NA NA NA NA 2 ...
##  $ DISINHIB_DIST : num  NA NA NA NA NA NA NA NA NA 0 ...
##  $ IRR           : num  0 0 0 0 0 0 0 1 0 1 ...
##  $ IRRSEV        : num  NA NA NA NA NA NA NA 3 NA 2 ...
##  $ IRRIT_DIST    : num  NA NA NA NA NA NA NA 5 NA 5 ...
##  $ MOT           : num  1 0 0 0 0 0 0 0 0 1 ...
##  $ MOTSEV        : num  1 NA NA NA NA NA NA NA NA 3 ...
##  $ MOTOR_DIST    : num  NA NA NA NA NA NA NA NA NA 0 ...
##  $ NITE          : num  0 1 0 0 0 0 0 0 0 1 ...
##  $ NITESEV       : num  NA 2 NA NA NA NA NA NA NA 1 ...
##  $ NIGHTTIME_DIST: num  NA 2 NA NA NA NA NA NA NA 0 ...
##  $ APP           : num  1 1 1 0 1 0 0 0 0 1 ...
##  $ APPSEV        : num  2 2 1 NA 3 NA NA NA NA 3 ...
##  $ APPETITE_DIST : num  NA 0 0 NA 5 NA NA NA NA 2 ...
##  $ DEL           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ DELSEV        : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ DELUSION_DIST : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ PARA          : num  0 0 0 0 0 0 NA NA NA 0 ...
##  $ PARAC         : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ PARAB         : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ PARAD         : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ HALL          : num  1 1 0 0 0 0 0 0 0 0 ...
##  $ HALLSEV       : num  1 1 NA NA NA NA NA NA NA NA ...
##  $ HALLUCIN_DIST : num  NA 0 NA NA NA NA NA NA NA NA ...
##  $ AUDHALL       : num  4 1 0 0 8 0 NA NA NA 0 ...
##  $ AUDHALLC      : num  NA NA NA NA 9 NA NA NA NA NA ...
##  $ AUDHALLB      : num  1 1 NA NA 1 NA NA NA NA NA ...
##  $ AUDHALLD      : num  0 0 NA NA 0 NA NA NA NA NA ...
##  $ VISHALL       : num  0 1 NA NA 8 9 NA NA NA NA ...
##  $ VISHALLB      : num  NA 1 NA NA 1 NA NA NA NA NA ...
##  $ VISHALLC      : num  NA NA NA NA 9 NA NA NA NA NA ...
##  $ VISHALLD      : num  NA 0 NA NA 0 NA NA NA NA NA ...
##  $ MISIDP        : num  2 0 9 9 9 9 NA NA NA 9 ...
##  $ MISIDPB       : num  1 NA NA NA NA NA NA NA NA NA ...
##  $ MISIDPC       : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ MISIDPD       : num  0 NA NA NA NA NA NA NA NA NA ...
##  $ MISIDSEL      : num  0 0 9 9 9 9 NA NA NA 0 ...
##  $ MISIDSB       : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ MISIDSC       : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ MISIDSD       : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ MISIDT        : num  4 0 9 9 9 9 NA NA NA 0 ...
##  $ MISIDTB       : num  1 NA NA NA NA NA NA NA NA NA ...
##  $ MISIDTC       : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ MISIDTD       : num  0 NA NA NA NA NA NA NA NA NA ...
##  $ INFID         : num  0 0 9 9 9 9 NA NA NA 9 ...
##  $ INFIDB        : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ INFIDC        : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ INFIDD        : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ ABND          : num  0 0 9 9 9 9 NA NA NA 0 ...
##  $ ABNDB         : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ ABNDC         : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ ABNDD         : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ IMP           : num  0 0 9 9 9 9 NA NA NA 0 ...
##  $ IMPB          : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ IMPC          : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ IMPD          : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ TVR           : num  0 0 9 9 9 9 NA NA NA 0 ...
##  $ TVRB          : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ TVRC          : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ TVRD          : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ OPIH          : num  4 0 9 9 9 9 NA NA NA 0 ...
##  $ OPIHB         : num  1 NA NA NA NA NA NA NA NA NA ...
##  $ OPIHC         : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ OPIHD         : num  0 NA NA NA NA NA NA NA NA NA ...
##  $ DPSA          : num  4 0 9 9 9 9 NA NA NA 0 ...
##  $ DPSAB         : num  9 NA NA NA NA NA NA NA NA NA ...
##  $ DPSAC         : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ DPSAD         : num  0 NA NA NA NA NA NA NA NA NA ...
##  $ HNH           : num  2 0 9 9 9 9 NA NA NA 0 ...
##  $ HNHB          : num  1 NA NA NA NA NA NA NA NA NA ...
##  $ HNHC          : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ HNHD          : num  0 NA NA NA NA NA NA NA NA NA ...
##  $ INTQUAL       : num  0 0 0 0 0 0 NA NA NA 0 ...
##  $ NOTES         : chr  NA NA NA NA ...


Pull the regenerated DD

## load this table's sheet from the revised data dictionary workbook
## (revisedDDpath); dfDD is overwritten for every dataset section
dfDD <- read_excel(revisedDDpath, sheet = "ALZ_NPIQ_CBRS")


Handling Logical Variables

## columns imported as logical (they appear to be empty in the str() listing),
## so their intended type has to be taken from the data dictionary
logicols <- colnames(df)[vapply(df, is.logical, logical(1))]

## view those variables in the regenerated DD
dfDD[dfDD$VarNames %in% logicols, c("VarNames", "Data Type")]
## # A tibble: 1 × 2
##   VarNames `Data Type`
##   <chr>    <chr>
## 1 REFCTR   VARCHAR2(6)
## logical columns the DD declares as CHAR/VARCHAR are converted to character
## (TRUE spelled out: T is an ordinary variable and can be reassigned)
convert2chr <- dfDD$VarNames[
  dfDD$VarNames %in% logicols &
    grepl("CHAR", dfDD$`Data Type`, ignore.case = TRUE)
]

## convert
df[convert2chr] <- lapply(df[convert2chr], as.character)

## recheck the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
## 
## [[2]]
## [1] "character"
## 
## [[3]]
## [1] "POSIXct" "POSIXt"


Handling Date Variables

## extract date variables from sub-dataset (read_excel imports them as POSIXct)
datecols <- colnames(df)[vapply(
  df,
  function(x) inherits(x, c("POSIXct", "POSIXt")),
  logical(1)
)]
## [1] "EXAM_DATE"     "DATE_OF_BIRTH"

## extract date variables from regenerated DD
datecolsFromDD <- dfDD$VarNames[dfDD$`Data Type` %in% c("DATE", "date")]

## compare the two to see if we are missing any date variables
setdiff(datecols, datecolsFromDD) ## character(0)
## character(0)
setdiff(datecolsFromDD, datecols) ## character(0)
## character(0)
## preview; drop = FALSE keeps a data frame even with a single date column
head(df[, datecols, drop = FALSE])
##    EXAM_DATE DATE_OF_BIRTH
## 1 2018-03-19    1933-06-05
## 2 2018-04-17    1947-04-23
## 3 2018-04-03    1931-07-01
## 4 2018-04-03    1935-05-25
## 5 2018-04-24    1930-06-19
## 6 2018-04-25    1933-03-11
## convert format: POSIXct date-times -> Date
df[datecols] <- lapply(df[datecols], as.Date)

## recheck the unique data types
unique(sapply(df, class))
## [1] "numeric"   "character" "Date"


Handling Character Variables

## extract character variables from sub-dataset; vapply() keeps the mask a
## logical vector for any input (sapply() is not type-stable)
chrcols <- colnames(df)[vapply(df, is.character, logical(1))] ## 11 vars

## check data type inconsistency:
## mismatchChrs_1: character in the data but not CHAR/VARCHAR in the DD
## mismatchChrs_2: CHAR/VARCHAR in the DD but not character in the data
chrColsfromDD <- dfDD[
  grepl("^(varchar|char)", dfDD$`Data Type`, ignore.case = TRUE),
  c("VarNames", "Data Type")
]

mismatchChrs_1 <- setdiff(chrcols, chrColsfromDD$VarNames) ## character(0)
mismatchChrs_2 <- setdiff(chrColsfromDD$VarNames, chrcols) ## character(0)

## extract character variables with value specification
tmp <- dfDD[
  grepl("CHAR|VARCHAR", dfDD$`Data Type`, ignore.case = TRUE) &
    !is.na(dfDD$`Valid Responses`),
  c("VarNames", "Valid Responses")
]

## check whether the unique values of the chr columns match the DD
DT::datatable(check_valid_responses(tmp, df))
## ignore NPIQINF, since it can be multiple values as specified in the DD


Handling Numeric Variables

## numeric columns present in the sub-dataset
numcols <- names(Filter(is.numeric, df)) ## 103 vars

## DD rows typed NUMBER, together with their documented valid responses
numColsfromDD <- dfDD[
  grepl("number", dfDD$`Data Type`, ignore.case = TRUE),
  c("VarNames", "Valid Responses")
]

## data type consistency (both expected to be empty):
## mismatchNums_1: numeric in the data but a different type in the DD
## mismatchNums_2: numeric in the DD but a different type in the data
mismatchNums_1 <- setdiff(numcols, numColsfromDD$VarNames) ## character(0)
mismatchNums_2 <- setdiff(numColsfromDD$VarNames, numcols) ## character(0)

unique(numColsfromDD$`Valid Responses`)
##  [1] NA                                      
##  [2] "1 thru 99999;"                         
##  [3] "1 thru 9999;"                          
##  [4] "0;\r\n1;"                              
##  [5] "1;\r\n2;\r\n3;"                        
##  [6] "1;\r\n2;"                              
##  [7] "1;\r\n0;"                              
##  [8] "0;\r\n1;\r\n2;\r\n3;\r\n4;\r\n5;"      
##  [9] "1;\r\n2;\r\n3;\r\n4;\r\n9;\r\n0;\r\n8;"
## [10] "1;\r\n2;\r\n3;\r\n4;\r\n9;"            
## [11] "0;\r\n1;\r\n9;"                        
## [12] "0;\r\n1;\r\n2;"
## NUMBER-typed DD rows that actually list valid responses
tmp <- numColsfromDD[!is.na(numColsfromDD$`Valid Responses`), ]

DT::datatable(check_valid_numeric_responses(tmp, df))
## ignore GP


Save Cleaned Data

ALZ_NPIQ_CBRS <- df



ALZ_RPFQ

df <- ALZ_RPFQ

info(ALZ_RPFQ,"SYSIND")
## #obs:132, cols:67, inds:132
## extract all the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
## 
## [[2]]
## [1] "character"
## 
## [[3]]
## [1] "logical"
## 
## [[4]]
## [1] "POSIXct" "POSIXt"
str(df, max.level = 99, list.len = 99999)
Click for details
## 'data.frame':    132 obs. of  67 variables:
##  $ SYSXM              : num  7895173 8010153 8011643 8012863 8001143 ...
##  $ SYSIND             : num  11218613 11109763 11447143 11458753 11248653 ...
##  $ SYSGP              : num  7928123 7921113 7968293 7974313 7931713 ...
##  $ SYSGPSTUDY         : num  1366233 1359223 1413403 1419423 1370023 ...
##  $ SYSINDGP           : num  7981883 7869283 8216213 8227823 8012383 ...
##  $ CGI_ORDER          : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ GPS_ORDER          : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ STDCGI_ORDER       : num  11 11 11 11 11 11 11 11 11 11 ...
##  $ LSTUDY             : chr  "ADCRLPRADI" "ADCONTROL" "ADCONTROL" "ADCONTROL" ...
##  $ DB_OWNER           : chr  "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" ...
##  $ STUDY              : chr  "ALZ" "ALZ" "ALZ" "ALZ" ...
##  $ SUBSTUDY           : chr  "ADCRLPRADI" "ADCONTROL" "ADCONTROL" "ADCONTROL" ...
##  $ CENTER             : chr  "IHG" "IHG" "IHG" "IHG" ...
##  $ GP                 : num  87998 87788 88462 88466 88118 ...
##  $ IND                : num  1 1 100 1 1 1 115 100 1 1 ...
##  $ REFCTR             : logi  NA NA NA NA NA NA ...
##  $ EXAM_DATE          : POSIXct, format: "2021-02-01" "2021-11-15" ...
##  $ EXAMINER           : chr  "sjt82" "jjs2031" "jjs2031" "mxc2207" ...
##  $ DATE_OF_BIRTH      : POSIXct, format: "1943-09-22" "1956-12-21" ...
##  $ AGE_AT_EXAM        : num  77 64 70 72 77 63 76 72 83 71 ...
##  $ REVIEW_DATE        : logi  NA NA NA NA NA NA ...
##  $ REVIEWER           : logi  NA NA NA NA NA NA ...
##  $ SMOKE              : num  2 1 1 1 1 2 2 1 2 1 ...
##  $ SMOKE_AGE_START    : num  NA 15 15 16 12 NA NA 18 NA 20 ...
##  $ SMOKE_CURR         : num  NA 2 2 2 2 NA NA 2 NA 2 ...
##  $ SMOKE_AGE_STOP     : num  NA 64 40 68 73 NA NA 40 NA 50 ...
##  $ PREGNANCIES        : num  NA NA 6 2 NA NA 1 NA NA NA ...
##  $ LIVE_KIDS          : num  NA NA 4 2 NA NA 1 NA NA NA ...
##  $ HRT                : num  NA NA 2 2 NA NA 2 NA 9 NA ...
##  $ HRT_AGE_START      : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ HRT_AGE_STOP       : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ HRT_YEARS          : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ HYSTERECTOMY       : num  NA NA 2 1 NA NA 2 NA 9 NA ...
##  $ HYSTERECTOMY_AGE   : num  NA NA NA 48 NA NA NA NA NA NA ...
##  $ OVARIES_RMV        : num  NA NA 2 2 NA NA 2 NA 9 NA ...
##  $ OVARIES_RMV_AGE    : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ OVARIES_RMV_BOTH   : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ HRT_OVR_RMV        : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ PHYSICAL_ACTIVITIES: num  NA 1 0 0 0 1 0 0 0 0 ...
##  $ NOPA_REASON        : num  NA NA 2 0 1 NA 0 1 1 1 ...
##  $ VA_PAST2W          : num  NA 0 0 0 0 0 0 0 NA 1 ...
##  $ VA_PAST2W_TIMES    : num  NA NA NA NA NA NA NA NA NA 1 ...
##  $ VA_PAST2W_MINS     : num  NA NA NA NA NA NA NA NA NA 60 ...
##  $ MA_PAST2W          : num  NA 0 0 0 0 0 0 0 NA 0 ...
##  $ MA_PAST2W_TIMES    : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ MA_PAST2W_MINS     : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ LA_PAST2W          : num  NA 1 0 0 0 1 0 0 NA 1 ...
##  $ LA_PAST2W_TIMES    : num  NA 2 NA NA NA 14 NA NA NA 1 ...
##  $ LA_PAST2W_MINS     : num  NA 15 NA NA NA 30 NA NA NA 60 ...
##  $ VA_AR13            : num  NA 1 0 1 1 1 0 0 NA 1 ...
##  $ VA_AR13_LEVEL      : chr  NA "V" NA "A" ...
##  $ MA_AR13            : num  NA 1 1 1 1 1 0 0 NA 1 ...
##  $ MA_AR13_LEVEL      : chr  NA "V" "V" "A" ...
##  $ LA_AR13            : num  NA 1 1 1 1 1 0 0 NA 1 ...
##  $ LA_AR13_LEVEL      : chr  NA "V" "V" "A" ...
##  $ VA_AR24            : num  NA 0 0 1 1 0 0 0 NA 1 ...
##  $ VA_AR24_LEVEL      : chr  NA NA NA "V" ...
##  $ MA_AR24            : num  NA 1 1 1 1 0 0 0 NA 1 ...
##  $ MA_AR24_LEVEL      : chr  NA "F" "V" "V" ...
##  $ LA_AR24            : num  NA 1 1 1 1 1 0 0 NA 1 ...
##  $ LA_AR24_LEVEL      : chr  NA "F" "V" "V" ...
##  $ VA_AR50            : num  NA 0 0 0 0 0 0 0 NA 1 ...
##  $ VA_AR50_LEVEL      : chr  NA NA NA NA ...
##  $ MA_AR50            : num  NA 0 0 0 1 0 0 0 NA 1 ...
##  $ MA_AR50_LEVEL      : chr  NA NA NA NA ...
##  $ LA_AR50            : num  NA 1 1 1 1 1 0 0 NA 1 ...
##  $ LA_AR50_LEVEL      : chr  NA "V" "F" "V" ...


Pull the regenerated DD

dfDD <- read_excel(revisedDDpath, sheet = "ALZ_RPFQ")


Handling Logical Variables

## Logical columns are re-typed to whatever the regenerated DD declares for them.
## extract all logical variables
logicols <- colnames(df)[sapply(df, is.logical)]

## view those variables in the regenerated DD
dfDD[dfDD$VarNames %in% logicols,c("VarNames","Data Type")]
## # A tibble: 3 × 2
##   VarNames    `Data Type`
##   <chr>       <chr>      
## 1 REFCTR      VARCHAR2(6)
## 2 REVIEW_DATE date       
## 3 REVIEWER    VARCHAR
## split the logical columns by the type the DD declares (case-insensitive match;
## TRUE is spelled out because `T` is an ordinary variable and can be reassigned)
convert2chr <- dfDD$VarNames[dfDD$VarNames %in% logicols & grepl("CHAR", dfDD$`Data Type`, ignore.case = TRUE)]
convert2date <- dfDD$VarNames[dfDD$VarNames %in% logicols & grepl("date", dfDD$`Data Type`, ignore.case = TRUE)]

## convert each group to its DD type
df[convert2chr] <- lapply(df[convert2chr], as.character)
df[convert2date] <- lapply(df[convert2date], as.Date)

## recheck the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
## 
## [[2]]
## [1] "character"
## 
## [[3]]
## [1] "POSIXct" "POSIXt" 
## 
## [[4]]
## [1] "Date"


Handling Date Variables

## date-time columns in the sub-dataset (class POSIXct/POSIXt)
datecols <- names(Filter(function(col) inherits(col, c("POSIXct", "POSIXt")), df))
## [1] "EXAM_DATE"     "DATE_OF_BIRTH"

## variables the regenerated DD types as DATE (either letter case)
datecolsFromDD <- dfDD$VarNames[is.element(dfDD$`Data Type`, c("DATE","date"))]

## compare the two lists to see whether any date variable is missing on either side
setdiff(datecols, datecolsFromDD) ## character(0)
## character(0)
setdiff(datecolsFromDD, datecols) ## REVIEW_DATE, ignore it, it has been corrected in previous step
## [1] "REVIEW_DATE"
head(df[, datecols])
##    EXAM_DATE DATE_OF_BIRTH
## 1 2021-02-01    1943-09-22
## 2 2021-11-15    1956-12-21
## 3 2021-08-18    1951-03-04
## 4 2021-12-06    1949-06-04
## 5 2021-09-09    1944-01-03
## 6 2021-11-15    1958-03-03
## POSIXct -> Date (drops the time-of-day component)
df[datecols] <- lapply(df[datecols], as.Date)

## recheck the unique data types
unique(sapply(df, class))
## [1] "numeric"   "character" "Date"


Handling Character Variables

## character columns present in the sub-dataset
chrcols <- names(Filter(is.character, df)) ## 17 vars

## DD rows whose type starts with CHAR/VARCHAR
chrColsfromDD <- dfDD[
  grepl("^(varchar|char)", dfDD$`Data Type`, ignore.case = TRUE),
  c("VarNames", "Data Type")
]

## data type consistency (both expected to be empty):
## mismatchChrs_1: character in the data but a different type in the DD
## mismatchChrs_2: character in the DD but a different type in the data
mismatchChrs_1 <- setdiff(chrcols, chrColsfromDD$VarNames) ## character(0)
mismatchChrs_2 <- setdiff(chrColsfromDD$VarNames, chrcols) ## character(0)

## character variables for which the DD lists valid responses
tmp <- dfDD[
  grepl("CHAR|VARCHAR", dfDD$`Data Type`, ignore.case = TRUE) &
    !is.na(dfDD$`Valid Responses`),
  c("VarNames", "Valid Responses")
]

## check whether the unique values of those columns in the dataset match the DD
DT::datatable(check_valid_responses(tmp, df))
## ignore EXAMINER


Handling Numeric Variables

## numeric columns present in the sub-dataset
numcols <- names(Filter(is.numeric, df)) ## 47 vars

## DD rows typed NUMBER, together with their documented valid responses
numColsfromDD <- dfDD[
  grepl("number", dfDD$`Data Type`, ignore.case = TRUE),
  c("VarNames", "Valid Responses")
]

## data type consistency (both expected to be empty):
## mismatchNums_1: numeric in the data but a different type in the DD
## mismatchNums_2: numeric in the DD but a different type in the data
mismatchNums_1 <- setdiff(numcols, numColsfromDD$VarNames) ## character(0)
mismatchNums_2 <- setdiff(numColsfromDD$VarNames, numcols) ## character(0)

unique(numColsfromDD$`Valid Responses`)
## [1] NA                                 "1 thru 99999;"                   
## [3] "1 thru 9999;"                     "1;\r\n2;\r\n9;"                  
## [5] "0;\r\n1;\r\n9;"                   "0;\r\n1;\r\n2;\r\n3;\r\n4;\r\n5;"
## [7] "0;\r\n1;"
## NUMBER-typed DD rows that actually list valid responses
tmp <- numColsfromDD[!is.na(numColsfromDD$`Valid Responses`), ]

DT::datatable(check_valid_numeric_responses(tmp, df))
## ignore GP


Save Cleaned Data

ALZ_RPFQ <- df



ALZ_SCREENING

df <- ALZ_SCREENING

info(ALZ_SCREENING,"SYSIND")
## #obs:279, cols:49, inds:272
## extract all the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
## 
## [[2]]
## [1] "character"
## 
## [[3]]
## [1] "logical"
## 
## [[4]]
## [1] "POSIXct" "POSIXt"
str(df, max.level = 99, list.len = 99999)
Click for details
## 'data.frame':    279 obs. of  49 variables:
##  $ SYSXM            : num  7178373 7178243 7178253 7178263 7178273 ...
##  $ SYSIND           : num  1.1e+07 1.1e+07 1.1e+07 1.1e+07 1.1e+07 ...
##  $ SYSGP            : num  7894403 7894393 7896003 7896013 7896093 ...
##  $ SYSGPSTUDY       : num  1309723 1309713 1311323 1311333 1311413 ...
##  $ SYSINDGP         : num  7793363 7793333 7795173 7795203 7795323 ...
##  $ CGI_ORDER        : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ GPS_ORDER        : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ STDCGI_ORDER     : num  11 11 11 11 11 11 11 11 11 11 ...
##  $ LSTUDY           : chr  "ADCRLPRADI" "ADCRLPRADI" "ADCRLPRADI" "ADCRLPRADI" ...
##  $ DB_OWNER         : chr  "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" ...
##  $ STUDY            : chr  "ALZ" "ALZ" "ALZ" "ALZ" ...
##  $ SUBSTUDY         : chr  "ADCRLPRADI" "ADCRLPRADI" "ADCRLPRADI" "ADCRLPRADI" ...
##  $ CENTER           : chr  "IHG" "IHG" "IHG" "IHG" ...
##  $ GP               : num  87648 87503 87504 87505 87512 ...
##  $ IND              : num  101 1 1 9000 1 ...
##  $ REFCTR           : logi  NA NA NA NA NA NA ...
##  $ FORM_DATE        : POSIXct, format: "2017-07-18" "2017-07-14" ...
##  $ FILLED_OUT_BY    : chr  "axr1589" "axr1589" "axr1589" "axr1589" ...
##  $ DATE_OF_BIRTH    : POSIXct, format: "1948-02-01" "1939-01-13" ...
##  $ LUMBAR_YES_NO    : chr  "N" "N" "N" "N" ...
##  $ LUMBAR_DATE      : POSIXct, format: NA NA ...
##  $ LUMBAR_NO_DATE   : chr  NA NA NA NA ...
##  $ LUMBAR_PUNCTURE  : chr  NA NA NA NA ...
##  $ BRAIN_MRI_YES_NO : chr  "N" "Y" "N" "N" ...
##  $ BRAIN_MRI_DATE   : POSIXct, format: NA NA ...
##  $ BRAIN_MRI_NO_DATE: chr  NA NA NA NA ...
##  $ BRAIN_MRI        : chr  NA "NL" NA NA ...
##  $ BRAIN_CT_YES_NO  : chr  "N" "N" "N" "N" ...
##  $ BRAIN_CT_DATE    : POSIXct, format: NA NA ...
##  $ BRAIN_CT_NO_DATE : chr  NA NA NA NA ...
##  $ BRAIN_CT         : chr  NA NA NA NA ...
##  $ EEG_YES_NO       : chr  "N" "N" "N" "N" ...
##  $ EEG_DATE         : POSIXct, format: NA NA ...
##  $ EEG_NO_DATE      : chr  NA NA NA NA ...
##  $ EEG              : chr  NA NA NA NA ...
##  $ PET_SP_YES_NO    : chr  "N" "N" "N" "N" ...
##  $ PET_SP_DATE      : POSIXct, format: NA NA ...
##  $ PET_SP_NO_DATE   : chr  NA NA NA NA ...
##  $ PET_SP           : chr  NA NA NA NA ...
##  $ BRAIN_BIO_YES_NO : chr  "N" "N" "N" "N" ...
##  $ BRAIN_BIO_DATE   : logi  NA NA NA NA NA NA ...
##  $ BRAIN_BIO_NO_DATE: logi  NA NA NA NA NA NA ...
##  $ BRAIN_BIO        : logi  NA NA NA NA NA NA ...
##  $ LUMB_NOTES       : logi  NA NA NA NA NA NA ...
##  $ BRNMRI_NOTES     : logi  NA NA NA NA NA NA ...
##  $ BRNCT_NOTES      : logi  NA NA NA NA NA NA ...
##  $ EEG_NOTES        : logi  NA NA NA NA NA NA ...
##  $ PETSP_NOTES      : logi  NA NA NA NA NA NA ...
##  $ BRNBIO_NOTES     : logi  NA NA NA NA NA NA ...


Pull the regenerated DD

dfDD <- read_excel(revisedDDpath, sheet = "ALZ_SCREENING")


Handling Logical Variables

## Logical columns are re-typed to whatever the regenerated DD declares for them.
## extract all logical variables
logicols <- colnames(df)[sapply(df, is.logical)]

## view those variables in the regenerated DD
dfDD[dfDD$VarNames %in% logicols,c("VarNames","Data Type")]
## # A tibble: 10 × 2
##    VarNames          `Data Type`   
##    <chr>             <chr>         
##  1 REFCTR            VARCHAR2(6)   
##  2 BRAIN_BIO_DATE    DATE          
##  3 BRAIN_BIO_NO_DATE CHAR(2)       
##  4 BRAIN_BIO         CHAR(2)       
##  5 LUMB_NOTES        VARCHAR2(4000)
##  6 BRNMRI_NOTES      VARCHAR2(4000)
##  7 BRNCT_NOTES       VARCHAR2(4000)
##  8 EEG_NOTES         VARCHAR2(4000)
##  9 PETSP_NOTES       VARCHAR2(4000)
## 10 BRNBIO_NOTES      VARCHAR2(4000)
## split the logical columns by the type the DD declares; the match is
## case-insensitive (as in the other sub-datasets) because the DD sheets spell
## the type both as "DATE" and as "date"
convert2chr <- dfDD$VarNames[dfDD$VarNames %in% logicols & grepl("CHAR", dfDD$`Data Type`, ignore.case = TRUE)]
convert2date <- dfDD$VarNames[dfDD$VarNames %in% logicols & grepl("DATE", dfDD$`Data Type`, ignore.case = TRUE)]

## convert each group to its DD type
df[convert2chr] <- lapply(df[convert2chr], as.character)
df[convert2date] <- lapply(df[convert2date], as.Date)

## recheck the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
## 
## [[2]]
## [1] "character"
## 
## [[3]]
## [1] "POSIXct" "POSIXt" 
## 
## [[4]]
## [1] "Date"


Handling Date Variables

## date-time columns in the sub-dataset (class POSIXct/POSIXt)
datecols <- names(Filter(function(col) inherits(col, c("POSIXct", "POSIXt")), df))
## [1] "FORM_DATE"      "DATE_OF_BIRTH"  "LUMBAR_DATE"    "BRAIN_MRI_DATE" "BRAIN_CT_DATE"  "EEG_DATE"       "PET_SP_DATE"

## variables the regenerated DD types as DATE (either letter case)
datecolsFromDD <- dfDD$VarNames[is.element(dfDD$`Data Type`, c("DATE","date"))]

## compare the two lists to see whether any date variable is missing on either side
setdiff(datecols, datecolsFromDD) ## character(0)
## character(0)
setdiff(datecolsFromDD, datecols) ## "BRAIN_BIO_DATE", ignore it, it has been corrected in previous step
## [1] "BRAIN_BIO_DATE"
head(df[, datecols])
##    FORM_DATE DATE_OF_BIRTH LUMBAR_DATE BRAIN_MRI_DATE BRAIN_CT_DATE EEG_DATE
## 1 2017-07-18    1948-02-01        <NA>           <NA>          <NA>     <NA>
## 2 2017-07-14    1939-01-13        <NA>           <NA>          <NA>     <NA>
## 3 2017-07-14    1944-10-03        <NA>           <NA>          <NA>     <NA>
## 4 2017-07-14    1960-10-23        <NA>           <NA>          <NA>     <NA>
## 5 2017-07-14    1940-11-18        <NA>           <NA>          <NA>     <NA>
## 6 2017-07-14    1946-10-04        <NA>           <NA>          <NA>     <NA>
##   PET_SP_DATE
## 1        <NA>
## 2        <NA>
## 3        <NA>
## 4        <NA>
## 5        <NA>
## 6        <NA>
## POSIXct -> Date (drops the time-of-day component)
df[datecols] <- lapply(df[datecols], as.Date)

## recheck the unique data types
unique(sapply(df, class))
## [1] "numeric"   "character" "Date"


Handling Character Variables

## character columns present in the sub-dataset
chrcols <- names(Filter(is.character, df)) ## 31 vars

## DD rows whose type starts with CHAR/VARCHAR
chrColsfromDD <- dfDD[
  grepl("^(varchar|char)", dfDD$`Data Type`, ignore.case = TRUE),
  c("VarNames", "Data Type")
]

## data type consistency (both expected to be empty):
## mismatchChrs_1: character in the data but a different type in the DD
## mismatchChrs_2: character in the DD but a different type in the data
mismatchChrs_1 <- setdiff(chrcols, chrColsfromDD$VarNames) ## character(0)
mismatchChrs_2 <- setdiff(chrColsfromDD$VarNames, chrcols) ## character(0)

## character variables for which the DD lists valid responses
tmp <- dfDD[
  grepl("CHAR|VARCHAR", dfDD$`Data Type`, ignore.case = TRUE) &
    !is.na(dfDD$`Valid Responses`),
  c("VarNames", "Valid Responses")
]

## check whether the unique values of those columns in the dataset match the DD
DT::datatable(check_valid_responses(tmp, df))
## ignore FILLED_OUT_BY


Handling Numeric Variables

## numeric columns present in the sub-dataset
numcols <- names(Filter(is.numeric, df)) ## 10 vars

## DD rows typed NUMBER, together with their documented valid responses
numColsfromDD <- dfDD[
  grepl("number", dfDD$`Data Type`, ignore.case = TRUE),
  c("VarNames", "Valid Responses")
]

## data type consistency (both expected to be empty):
## mismatchNums_1: numeric in the data but a different type in the DD
## mismatchNums_2: numeric in the DD but a different type in the data
mismatchNums_1 <- setdiff(numcols, numColsfromDD$VarNames) ## character(0)
mismatchNums_2 <- setdiff(numColsfromDD$VarNames, numcols) ## character(0)

unique(numColsfromDD$`Valid Responses`)
## [1] NA              "1 thru 99999;" "1 thru 9999;"
## NUMBER-typed DD rows that actually list valid responses
tmp <- numColsfromDD[!is.na(numColsfromDD$`Valid Responses`), ]

DT::datatable(check_valid_numeric_responses(tmp, df))
## ignore GP


Save Cleaned Data

ALZ_SCREENING <- df



ALZ_SCREENING_RC

df <- ALZ_SCREENING_RC

info(ALZ_SCREENING_RC,"SYSIND")
## #obs:556, cols:61, inds:552
## extract all the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
## 
## [[2]]
## [1] "character"
## 
## [[3]]
## [1] "logical"
## 
## [[4]]
## [1] "POSIXct" "POSIXt"
str(df, max.level = 99, list.len = 99999)
Click for details
## 'data.frame':    556 obs. of  61 variables:
##  $ SYSXM             : num  8258773 8258813 8260093 8277633 8278003 ...
##  $ SYSIND            : num  11037673 11369813 11362953 11638763 11621333 ...
##  $ SYSGP             : num  7894423 7952013 7946353 8007323 8006293 ...
##  $ SYSGPSTUDY        : num  1309743 1397123 1387463 1454033 1453003 ...
##  $ SYSINDGP          : num  7793413 8139083 8132223 8407833 8390403 ...
##  $ CGI_ORDER         : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ GPS_ORDER         : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ STDCGI_ORDER      : num  11 11 11 11 11 11 11 11 11 11 ...
##  $ LSTUDY            : chr  "ADFAMPRADI" "ADCRLPRADI" "ADFAMPRADI" "ADCONTROL" ...
##  $ DB_OWNER          : chr  "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" ...
##  $ STUDY             : chr  "ALZ" "ALZ" "ALZ" "ALZ" ...
##  $ SUBSTUDY          : chr  "ADFAMPRADI" "ADCRLPRADI" "ADFAMPRADI" "ADCONTROL" ...
##  $ CENTER            : chr  "IHG" "IHG" "IHG" "IHG" ...
##  $ GP                : num  87650 88301 87545 104540 104528 ...
##  $ IND               : num  9000 1 106 1 1 ...
##  $ REFCTR            : logi  NA NA NA NA NA NA ...
##  $ EXAM_DATE         : POSIXct, format: "2023-10-24" "2024-02-13" ...
##  $ EXAMINER          : chr  "gsv32" "jjs2031" "jjs2031" "gsv32" ...
##  $ DATE_OF_BIRTH     : POSIXct, format: "1954-10-29" "1947-05-13" ...
##  $ AGE_AT_EXAM       : num  68 76 66 86 86 67 60 81 77 62 ...
##  $ REVIEW_DATE       : logi  NA NA NA NA NA NA ...
##  $ REVIEWER          : logi  NA NA NA NA NA NA ...
##  $ LUMB_YN           : chr  "N" "N" "N" "N" ...
##  $ LUMB_DT           : POSIXct, format: NA NA ...
##  $ LUMB_PUNC         : chr  NA NA NA NA ...
##  $ LUMB_NOTES        : chr  NA NA NA NA ...
##  $ BRNMRI_YN         : chr  "Y" "N" "Y" "N" ...
##  $ BRNMRI_DT         : POSIXct, format: "2017-10-01" NA ...
##  $ BRAIN_MRI         : chr  "NL" NA "AC" NA ...
##  $ BRNMRI_NOTES      : chr  NA NA "NO DATE AVAILABLE" NA ...
##  $ BRNCT_YN          : chr  "Y" "N" "N" "N" ...
##  $ BRNCT_DT          : POSIXct, format: "2017-10-01" NA ...
##  $ BRAIN_CT          : chr  "NL" NA NA NA ...
##  $ BRNCT_NOTES       : chr  NA NA NA NA ...
##  $ EEG_YN            : chr  "Y" "N" "N" "N" ...
##  $ EEG_DT            : POSIXct, format: "2017-10-01" NA ...
##  $ EEG               : chr  "NL" NA NA NA ...
##  $ EEG_NOTES         : chr  NA NA NA NA ...
##  $ PETSP_YN          : chr  "N" "N" "N" "N" ...
##  $ PETSP_DT          : POSIXct, format: NA NA ...
##  $ PET_SPECT         : chr  NA NA NA NA ...
##  $ PETSP_NOTES       : chr  NA NA NA NA ...
##  $ BRNBIO_YN         : chr  "N" "N" "N" "N" ...
##  $ BRNBIO_DT         : logi  NA NA NA NA NA NA ...
##  $ BRAIN_BIO         : logi  NA NA NA NA NA NA ...
##  $ BRNBIO_NOTES      : logi  NA NA NA NA NA NA ...
##  $ PRIOR_SCORE_MMSE1 : logi  NA NA NA NA NA NA ...
##  $ DATE_MMSE1        : logi  NA NA NA NA NA NA ...
##  $ PRIOR_SCORE_MOCA1 : logi  NA NA NA NA NA NA ...
##  $ DATE_MOCA1        : logi  NA NA NA NA NA NA ...
##  $ PRIOR_SC_BROOKE1  : logi  NA NA NA NA NA NA ...
##  $ DATE_BROOKE1      : logi  NA NA NA NA NA NA ...
##  $ PRIOR_SC_CHIF1    : logi  NA NA NA NA NA NA ...
##  $ DATE_CHIF1        : logi  NA NA NA NA NA NA ...
##  $ PRIOR_SC_WORDLIST1: logi  NA NA NA NA NA NA ...
##  $ DATE_WORDLIST1    : logi  NA NA NA NA NA NA ...
##  $ OTHER_TEST1       : logi  NA NA NA NA NA NA ...
##  $ DATE_OTHER_TEST1  : logi  NA NA NA NA NA NA ...
##  $ PRIOR_CLASSIF1    : logi  NA NA NA NA NA NA ...
##  $ PRIOR_ASSESS_NOTE1: logi  NA NA NA NA NA NA ...
##  $ NOTE_ALZ_SCREEN   : chr  NA NA NA NA ...


Pull the regenerated DD

dfDD <- read_excel(revisedDDpath, sheet = "ALZ_SCREENING_RC")


Handling Logical Variables

## Logical columns are re-typed to whatever the regenerated DD declares for them.
## extract all logical variables
logicols <- colnames(df)[sapply(df, is.logical)]

## view those variables in the regenerated DD
dfDD[dfDD$VarNames %in% logicols,c("VarNames","Data Type")]
## # A tibble: 20 × 2
##    VarNames           `Data Type`   
##    <chr>              <chr>         
##  1 REFCTR             VARCHAR2(6)   
##  2 REVIEW_DATE        date          
##  3 REVIEWER           VARCHAR       
##  4 BRNBIO_DT          DATE          
##  5 BRAIN_BIO          CHAR(2)       
##  6 BRNBIO_NOTES       VARCHAR2(4000)
##  7 PRIOR_SCORE_MMSE1  NUMBER(3)     
##  8 DATE_MMSE1         DATE          
##  9 PRIOR_SCORE_MOCA1  NUMBER(3)     
## 10 DATE_MOCA1         DATE          
## 11 PRIOR_SC_BROOKE1   NUMBER(3)     
## 12 DATE_BROOKE1       DATE          
## 13 PRIOR_SC_CHIF1     NUMBER(3)     
## 14 DATE_CHIF1         DATE          
## 15 PRIOR_SC_WORDLIST1 NUMBER(3)     
## 16 DATE_WORDLIST1     DATE          
## 17 OTHER_TEST1        NUMBER(3)     
## 18 DATE_OTHER_TEST1   DATE          
## 19 PRIOR_CLASSIF1     VARCHAR2(50)  
## 20 PRIOR_ASSESS_NOTE1 VARCHAR2(150)
## split the logical columns by the type the DD declares (case-insensitive match;
## TRUE is spelled out because `T` is an ordinary variable and can be reassigned)
convert2chr <- dfDD$VarNames[dfDD$VarNames %in% logicols & grepl("CHAR", dfDD$`Data Type`, ignore.case = TRUE)] ## 6 vars
convert2date <- dfDD$VarNames[dfDD$VarNames %in% logicols & grepl("date", dfDD$`Data Type`, ignore.case = TRUE)] ## 8 vars
convert2num <- dfDD$VarNames[dfDD$VarNames %in% logicols & grepl("NUMBER", dfDD$`Data Type`, ignore.case = TRUE)] ## 6 vars

## convert each group to its DD type
df[convert2chr] <- lapply(df[convert2chr], as.character)
df[convert2date] <- lapply(df[convert2date], as.Date)
## the numeric targets must be built from the numeric columns themselves; the
## source used to be df[convert2chr], which only worked because both groups
## have 6 columns and every value is NA
df[convert2num] <- lapply(df[convert2num], as.numeric)

## recheck the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
## 
## [[2]]
## [1] "character"
## 
## [[3]]
## [1] "POSIXct" "POSIXt" 
## 
## [[4]]
## [1] "Date"


Handling Date Variables

## date-time columns in the sub-dataset (class POSIXct/POSIXt)
datecols <- names(Filter(function(col) inherits(col, c("POSIXct", "POSIXt")), df))
## [1] "EXAM_DATE"     "DATE_OF_BIRTH" "LUMB_DT"       "BRNMRI_DT"     "BRNCT_DT"      "EEG_DT"        "PETSP_DT"

## variables the regenerated DD types as DATE (either letter case)
datecolsFromDD <- dfDD$VarNames[is.element(dfDD$`Data Type`, c("DATE","date"))]

## compare the two lists to see whether any date variable is missing on either side
setdiff(datecols, datecolsFromDD) ## character(0)
## character(0)
setdiff(datecolsFromDD, datecols)
## [1] "REVIEW_DATE"      "BRNBIO_DT"        "DATE_MMSE1"       "DATE_MOCA1"      
## [5] "DATE_BROOKE1"     "DATE_CHIF1"       "DATE_WORDLIST1"   "DATE_OTHER_TEST1"
# [1] "REVIEW_DATE"      "BRNBIO_DT"        "DATE_MMSE1"       "DATE_MOCA1"       "DATE_BROOKE1"     "DATE_CHIF1"      
# [7] "DATE_WORDLIST1"   "DATE_OTHER_TEST1"
## these variables have been corrected in the previous step

head(df[, datecols])
##    EXAM_DATE DATE_OF_BIRTH LUMB_DT  BRNMRI_DT   BRNCT_DT     EEG_DT PETSP_DT
## 1 2023-10-24    1954-10-29    <NA> 2017-10-01 2017-10-01 2017-10-01     <NA>
## 2 2024-02-13    1947-05-13    <NA>       <NA>       <NA>       <NA>     <NA>
## 3 2024-02-20    1957-08-05    <NA>       <NA>       <NA>       <NA>     <NA>
## 4 2023-09-13    1937-08-13    <NA>       <NA>       <NA>       <NA>     <NA>
## 5 2023-05-09    1936-05-22    <NA>       <NA>       <NA>       <NA>     <NA>
## 6 2023-08-16    1956-01-09    <NA>       <NA>       <NA>       <NA>     <NA>
## POSIXct -> Date (drops the time-of-day component)
df[datecols] <- lapply(df[datecols], as.Date)

## recheck the unique data types
unique(sapply(df, class))
## [1] "numeric"   "character" "Date"


Handling Character Variables

## character columns present in the sub-dataset
chrcols <- names(Filter(is.character, df)) ## 29 vars

## DD rows whose type starts with CHAR/VARCHAR
chrColsfromDD <- dfDD[
  grepl("^(varchar|char)", dfDD$`Data Type`, ignore.case = TRUE),
  c("VarNames", "Data Type")
]

## data type consistency (both expected to be empty):
## mismatchChrs_1: character in the data but a different type in the DD
## mismatchChrs_2: character in the DD but a different type in the data
mismatchChrs_1 <- setdiff(chrcols, chrColsfromDD$VarNames) ## character(0)
mismatchChrs_2 <- setdiff(chrColsfromDD$VarNames, chrcols) ## character(0)

## character variables for which the DD lists valid responses
tmp <- dfDD[
  grepl("CHAR|VARCHAR", dfDD$`Data Type`, ignore.case = TRUE) &
    !is.na(dfDD$`Valid Responses`),
  c("VarNames", "Valid Responses")
]

## check whether the unique values of those columns in the dataset match the DD
DT::datatable(check_valid_responses(tmp, df))
## ignore EXAMINER


Handling Numeric Variables

## numeric columns present in the sub-dataset
numcols <- names(Filter(is.numeric, df)) ## 17 vars

## DD rows typed NUMBER, together with their documented valid responses
numColsfromDD <- dfDD[
  grepl("number", dfDD$`Data Type`, ignore.case = TRUE),
  c("VarNames", "Valid Responses")
]

## data type consistency (both expected to be empty):
## mismatchNums_1: numeric in the data but a different type in the DD
## mismatchNums_2: numeric in the DD but a different type in the data
mismatchNums_1 <- setdiff(numcols, numColsfromDD$VarNames) ## character(0)
mismatchNums_2 <- setdiff(numColsfromDD$VarNames, numcols) ## character(0)

unique(numColsfromDD$`Valid Responses`)
## [1] NA              "1 thru 99999;" "1 thru 9999;"
## NUMBER-typed DD rows that actually list valid responses
tmp <- numColsfromDD[!is.na(numColsfromDD$`Valid Responses`), ]

DT::datatable(check_valid_numeric_responses(tmp, df))
## ignore GP


Save Cleaned Data

ALZ_SCREENING_RC <- df



ALZ_STICK_D_RC

df <- ALZ_STICK_D_RC

info(ALZ_STICK_D_RC,"SYSIND")
## #obs:430, cols:46, inds:428
## extract all the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
## 
## [[2]]
## [1] "character"
## 
## [[3]]
## [1] "logical"
## 
## [[4]]
## [1] "POSIXct" "POSIXt"
str(df, max.level = 99, list.len = 99999)
Click for details
## 'data.frame':    430 obs. of  46 variables:
##  $ SYSXM                        : num  8275873 8258963 8259113 8277733 8277873 ...
##  $ SYSIND                       : num  11160523 11369813 11037673 11435853 11638763 ...
##  $ SYSGP                        : num  7923793 7952013 7894423 7962813 8007323 ...
##  $ SYSGPSTUDY                   : num  1361903 1397123 1309743 1407923 1454033 ...
##  $ SYSINDGP                     : num  7923633 8139083 7793413 8205123 8407833 ...
##  $ CGI_ORDER                    : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ GPS_ORDER                    : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ STDCGI_ORDER                 : num  11 11 11 11 11 11 11 11 11 11 ...
##  $ LSTUDY                       : chr  "ADCRLPRADI" "ADCRLPRADI" "ADFAMPRADI" "ADCRLPRADI" ...
##  $ DB_OWNER                     : chr  "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" ...
##  $ STUDY                        : chr  "ALZ" "ALZ" "ALZ" "ALZ" ...
##  $ SUBSTUDY                     : chr  "ADCRLPRADI" "ADCRLPRADI" "ADFAMPRADI" "ADCRLPRADI" ...
##  $ CENTER                       : chr  "IHG" "IHG" "IHG" "IHG" ...
##  $ GP                           : num  87883 88301 87650 88452 104540 ...
##  $ IND                          : num  1 1 9000 1 1 106 9000 1 1 1 ...
##  $ REFCTR                       : logi  NA NA NA NA NA NA ...
##  $ EXAM_DATE                    : POSIXct, format: "2024-02-14" "2024-02-13" ...
##  $ EXAMINER                     : chr  "gsv32" "jjs2031" "gsv32" "gsv32" ...
##  $ DATE_OF_BIRTH                : POSIXct, format: "1939-03-20" "1947-05-13" ...
##  $ AGE_AT_EXAM                  : num  84 76 68 81 86 66 56 79 79 77 ...
##  $ REVIEW_DATE                  : logi  NA NA NA NA NA NA ...
##  $ REVIEWER                     : logi  NA NA NA NA NA NA ...
##  $ DRSD_I                       : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ DRSD_II                      : num  1 1 1 1 1 0 1 1 1 1 ...
##  $ DRSD_III                     : num  0 1 0 1 1 0 1 1 0 1 ...
##  $ DRSD_IV                      : num  1 1 1 0 1 1 0 1 0 1 ...
##  $ DRSD_V                       : num  1 1 1 0 1 0 0 1 0 1 ...
##  $ DRSD_VI                      : num  0 0 0 0 0 0 0 1 0 1 ...
##  $ DRSD_VII                     : num  0 1 0 0 0 1 0 0 0 1 ...
##  $ DRSD_VIII                    : num  0 1 0 0 0 1 0 0 0 1 ...
##  $ DRSD_IX                      : num  0 0 0 0 0 1 0 0 0 1 ...
##  $ DRSD_X                       : num  1 0 1 1 0 0 0 1 1 1 ...
##  $ DRSD_XI                      : num  1 0 1 0 0 0 0 1 1 1 ...
##  $ DRSD_XII                     : num  1 0 0 0 0 0 0 1 1 1 ...
##  $ COMMENTS_DRSD                : chr  "did not remember chevron figure" NA NA "unable to remember figures: triangle with stem and chevron" ...
##  $ STATUS_DRSD                  : logi  NA NA NA NA NA NA ...
##  $ TOTAL_SCORE_ITEM1_DRSD       : num  2 3 2 3 3 1 3 3 2 3 ...
##  $ TOTAL_SCORE_ITEM1_DRSD_STATUS: logi  NA NA NA NA NA NA ...
##  $ TOTAL_SCORE_ITEM2_DRSD       : num  2 2 2 0 2 1 0 3 0 3 ...
##  $ TOTAL_SCORE_ITEM2_DRSD_STATUS: logi  NA NA NA NA NA NA ...
##  $ TOTAL_SCORE_ITEM3_DRSD       : num  0 2 0 0 0 3 0 0 0 3 ...
##  $ TOTAL_SCORE_ITEM3_DRSD_STATUS: logi  NA NA NA NA NA NA ...
##  $ TOTAL_SCORE_ITEM4_DRSD       : num  3 0 2 1 0 0 0 3 3 3 ...
##  $ TOTAL_SCORE_ITEM4_DRSD_STATUS: logi  NA NA NA NA NA NA ...
##  $ SUM_TOTAL_SCORE_DRSD         : num  7 7 6 4 5 5 3 9 5 12 ...
##  $ SUM_TOTAL_SCORE_DRSD_STATUS  : chr  NA NA NA NA ...


Pull the regenerated DD

dfDD <- read_excel(revisedDDpath, sheet = "ALZ_STICK_D_RC")


Handling Logical Variables

## extract all logical variables
logicols <- colnames(df)[sapply(df, is.logical)]

## view those variables in the regeneraed DD
dfDD[dfDD$VarNames %in% logicols,c("VarNames","Data Type")]
## # A tibble: 8 × 2
##   VarNames                      `Data Type`
##   <chr>                         <chr>      
## 1 REFCTR                        VARCHAR2(6)
## 2 REVIEW_DATE                   date       
## 3 REVIEWER                      VARCHAR    
## 4 STATUS_DRSD                   NUMBER(3)  
## 5 TOTAL_SCORE_ITEM1_DRSD_STATUS CHAR       
## 6 TOTAL_SCORE_ITEM2_DRSD_STATUS CHAR       
## 7 TOTAL_SCORE_ITEM3_DRSD_STATUS CHAR       
## 8 TOTAL_SCORE_ITEM4_DRSD_STATUS CHAR
## All-NA columns are read in as logical; pick their target type from the DD.
## Type names are matched case-insensitively, as in the numeric checks below.

## select the vars to be converted to numeric
convert2num <- dfDD$VarNames[dfDD$VarNames %in% logicols & grepl("NUMBER", dfDD$`Data Type`, ignore.case = TRUE)] ## STATUS_DRSD

## select the vars to be converted to date
convert2date <- dfDD$VarNames[dfDD$VarNames %in% logicols & grepl("date", dfDD$`Data Type`, ignore.case = TRUE)] ## REVIEW_DATE

## the rest should be converted to character
convert2chr <- setdiff(logicols, c(convert2num, convert2date))

## convert
df[convert2num] <- lapply(df[convert2num], as.numeric)
df[convert2date] <- lapply(df[convert2date], as.Date)
df[convert2chr] <- lapply(df[convert2chr], as.character)

## recheck the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
## 
## [[2]]
## [1] "character"
## 
## [[3]]
## [1] "POSIXct" "POSIXt" 
## 
## [[4]]
## [1] "Date"


Handling Date Variables

## extract date variables from sub-dataset
datecols <- colnames(df)[sapply(df, function(x) inherits(x, c("POSIXct", "POSIXt")))]
## [1] "EXAM_DATE"     "DATE_OF_BIRTH"

## extract date variables from regenerated DD
## (the DATE type is matched regardless of letter case)
datecolsFromDD <- dfDD$VarNames[toupper(dfDD$`Data Type`) %in% "DATE"]

## compare the two to see if we are missing any date variables
setdiff(datecols, datecolsFromDD) ## character(0)
## character(0)
setdiff(datecolsFromDD,datecols) ## [1] "REVIEW_DATE" can ignore REVIEW_DATE, as it has been corrected in previous step
## [1] "REVIEW_DATE"
head(df[,datecols])
##    EXAM_DATE DATE_OF_BIRTH
## 1 2024-02-14    1939-03-20
## 2 2024-02-13    1947-05-13
## 3 2023-10-24    1954-10-29
## 4 2024-02-15    1942-09-30
## 5 2023-09-13    1937-08-13
## 6 2024-02-20    1957-08-05
## convert format
df[datecols] <- lapply(df[datecols], as.Date)

## recheck the unique data types
unique(sapply(df, class))
## [1] "numeric"   "character" "Date"


Handling Character Variables

## character columns found in the sub-dataset
chrcols <- names(df)[vapply(df, is.character, logical(1))] ## 14 vars

## entries declared as CHAR / VARCHAR* in the DD
chrColsfromDD <- dfDD %>%
  dplyr::filter(grepl("^(varchar|char)", `Data Type`, ignore.case = TRUE)) %>%
  dplyr::select(VarNames, `Data Type`)

## data type consistency between data and DD:
## mismatchChrs_1: character in the data but not declared as character in the DD
## mismatchChrs_2: declared as character in the DD but not character in the data
mismatchChrs_1 <- setdiff(chrcols, chrColsfromDD$VarNames) ## character(0)
mismatchChrs_2 <- setdiff(chrColsfromDD$VarNames, chrcols) ## character(0)

## character variables for which the DD lists valid responses
tmp <- dfDD %>%
  dplyr::filter(
    grepl("CHAR|VARCHAR", `Data Type`, ignore.case = TRUE),
    !is.na(`Valid Responses`)
  ) %>%
  dplyr::select(VarNames, `Valid Responses`)

## compare the unique values of those columns in the dataset with the DD
DT::datatable(check_valid_responses(tmp, df))
## ignore EXAMINER


Handling Numeric Variables

## numeric columns found in the sub-dataset
numcols <- names(df)[vapply(df, is.numeric, logical(1))] ## 29 vars

## entries declared as NUMBER in the DD, with their valid responses
numColsfromDD <- dfDD %>%
  dplyr::filter(grepl("number", `Data Type`, ignore.case = TRUE)) %>%
  dplyr::select(VarNames, `Valid Responses`)

## data type consistency between data and DD:
## mismatchNums_1: numeric in the data but not declared as NUMBER in the DD
## mismatchNums_2: declared as NUMBER in the DD but not numeric in the data
mismatchNums_1 <- setdiff(numcols, numColsfromDD$VarNames) ## character(0)
mismatchNums_2 <- setdiff(numColsfromDD$VarNames, numcols) ## character(0)

unique(numColsfromDD$`Valid Responses`)
## [1] NA                             "1 thru 99999;"               
## [3] "1 thru 9999;"                 "1;\r\n0;\r\n"                
## [5] "995;\r\n996;\r\n997;\r\n998;"
## numeric variables for which the DD lists valid responses
tmp <- numColsfromDD %>%
  dplyr::filter(!is.na(`Valid Responses`))

DT::datatable(check_valid_numeric_responses(tmp, df))
## ignore GP


Save Cleaned Data

ALZ_STICK_D_RC <- df



B4_CDR_RC

df <- B4_CDR_RC

info(B4_CDR_RC,"SYSIND")
## #obs:599, cols:38, inds:592
## extract all the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
## 
## [[2]]
## [1] "character"
## 
## [[3]]
## [1] "logical"
## 
## [[4]]
## [1] "POSIXct" "POSIXt"
str(df, max.level = 99, list.len = 99999)
Click for details
## 'data.frame':    599 obs. of  38 variables:
##  $ SYSXM              : num  8275843 8276023 8276613 8258933 8259053 ...
##  $ SYSIND             : num  11160523 11620763 11369703 11369813 11037673 ...
##  $ SYSGP              : num  7923793 8005723 7951913 7952013 7894423 ...
##  $ SYSGPSTUDY         : num  1361903 1452433 1397023 1397123 1309743 ...
##  $ SYSINDGP           : num  7923633 8389833 8138973 8139083 7793413 ...
##  $ CGI_ORDER          : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ GPS_ORDER          : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ STDCGI_ORDER       : num  11 11 11 11 11 11 11 11 11 11 ...
##  $ LSTUDY             : chr  "ADCRLPRADI" "ADCONTROL" "ADCRLPRADI" "ADCRLPRADI" ...
##  $ DB_OWNER           : chr  "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" ...
##  $ STUDY              : chr  "ALZ" "ALZ" "ALZ" "ALZ" ...
##  $ SUBSTUDY           : chr  "ADCRLPRADI" "ADCONTROL" "ADCRLPRADI" "ADCRLPRADI" ...
##  $ CENTER             : chr  "IHG" "IHG" "IHG" "IHG" ...
##  $ GP                 : num  87883 104457 88299 88301 87650 ...
##  $ IND                : num  1 1 1 1 9000 1 1 1 1 106 ...
##  $ REFCTR             : logi  NA NA NA NA NA NA ...
##  $ EXAM_DATE          : POSIXct, format: "2024-02-14" "2023-04-17" ...
##  $ EXAMINER           : chr  "gsv32" "sjt82" "gsv32" "jjs2031" ...
##  $ DATE_OF_BIRTH      : POSIXct, format: "1939-03-20" "1946-12-19" ...
##  $ AGE_AT_EXAM        : num  84 76 79 76 68 73 81 86 86 66 ...
##  $ REVIEW_DATE        : logi  NA NA NA NA NA NA ...
##  $ REVIEWER           : logi  NA NA NA NA NA NA ...
##  $ METHOD_CDR         : chr  "IP" "IP" "IP" "IP" ...
##  $ MEMO_NOTE          : chr  NA NA NA NA ...
##  $ MEMO_SC            : num  0.5 1 0.5 0 0 0 0 0.5 0 1 ...
##  $ ORIENT_NOTE        : chr  NA NA NA NA ...
##  $ ORIENT_SC          : num  0 0.5 0 0 0 0 0 0 0 1 ...
##  $ P_SOLVE_NOTE       : chr  NA NA NA NA ...
##  $ P_SOLVE_SC         : num  0 1 0 0 0 0 0 0 0 1 ...
##  $ COM_AFFAIR_NOTE    : chr  NA NA NA NA ...
##  $ COM_AFFAIR_SC      : num  0 1 0 0 0 0 0 0 0 0.5 ...
##  $ HOME_HOB_NOTES     : chr  NA NA NA NA ...
##  $ HOME_HOB_SC        : num  0 1 0 0 0 0 0 0 0 0.5 ...
##  $ P_CARE_NOTE        : chr  NA NA NA NA ...
##  $ P_CARE_SC          : num  0 1 0 0 0 0 0 0 0 0.5 ...
##  $ CDR_TOTAL_CDR      : num  5 1 5 0 0 0 0 5 0 5 ...
##  $ SUM_BOXSCORE       : num  0.5 5.5 0.5 0 0 0 0 0.5 0 4.5 ...
##  $ SUM_BOXSCORE_STATUS: chr  NA NA NA NA ...


Pull the regenerated DD

dfDD <- read_excel(revisedDDpath, sheet = "B4_CDR_RC")


Handling Logical Variables

## extract all logical variables
logicols <- colnames(df)[sapply(df, is.logical)]

## view those variables in the regenerated DD
dfDD[dfDD$VarNames %in% logicols,c("VarNames","Data Type")]
## # A tibble: 3 × 2
##   VarNames    `Data Type`
##   <chr>       <chr>      
## 1 REFCTR      VARCHAR2(6)
## 2 REVIEW_DATE date       
## 3 REVIEWER    VARCHAR
## All-NA columns are read in as logical; pick their target type from the DD.

## select the vars to be converted to numeric; recomputed here so this section
## does not depend on the `convert2num` left over from the previous subfile
## (none are listed for this subfile in the DD view above)
convert2num <- dfDD$VarNames[dfDD$VarNames %in% logicols & grepl("NUMBER", dfDD$`Data Type`, ignore.case = TRUE)]

## select the vars to be converted to date
convert2date <- dfDD$VarNames[dfDD$VarNames %in% logicols & grepl("date", dfDD$`Data Type`, ignore.case = TRUE)] ## REVIEW_DATE

## the rest should be converted to character
convert2chr <- setdiff(logicols, c(convert2num, convert2date)) ## [1] "REFCTR"   "REVIEWER"

## convert
df[convert2num] <- lapply(df[convert2num], as.numeric)
df[convert2date] <- lapply(df[convert2date], as.Date)
df[convert2chr] <- lapply(df[convert2chr], as.character)

## recheck the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
## 
## [[2]]
## [1] "character"
## 
## [[3]]
## [1] "POSIXct" "POSIXt" 
## 
## [[4]]
## [1] "Date"


Handling Date Variables

## extract date variables from sub-dataset
datecols <- colnames(df)[sapply(df, function(x) inherits(x, c("POSIXct", "POSIXt")))]
## [1] "EXAM_DATE"     "DATE_OF_BIRTH"

## extract date variables from regenerated DD
## (the DATE type is matched regardless of letter case)
datecolsFromDD <- dfDD$VarNames[toupper(dfDD$`Data Type`) %in% "DATE"]

## compare the two to see if we are missing any date variables
setdiff(datecols, datecolsFromDD) ## character(0)
## character(0)
setdiff(datecolsFromDD,datecols) ## [1] "REVIEW_DATE" can ignore REVIEW_DATE, as it has been corrected in previous step
## [1] "REVIEW_DATE"
head(df[,datecols])
##    EXAM_DATE DATE_OF_BIRTH
## 1 2024-02-14    1939-03-20
## 2 2023-04-17    1946-12-19
## 3 2024-02-13    1944-09-22
## 4 2024-02-13    1947-05-13
## 5 2023-10-24    1954-10-29
## 6 2023-05-15    1950-04-02
## convert format
df[datecols] <- lapply(df[datecols], as.Date)

## recheck the unique data types
unique(sapply(df, class))
## [1] "numeric"   "character" "Date"


Handling Character Variables

## character columns found in the sub-dataset
chrcols <- names(df)[vapply(df, is.character, logical(1))] ## 16 vars

## entries declared as CHAR / VARCHAR* in the DD
chrColsfromDD <- dfDD %>%
  dplyr::filter(grepl("^(varchar|char)", `Data Type`, ignore.case = TRUE)) %>%
  dplyr::select(VarNames, `Data Type`)

## data type consistency between data and DD:
## mismatchChrs_1: character in the data but not declared as character in the DD
## mismatchChrs_2: declared as character in the DD but not character in the data
mismatchChrs_1 <- setdiff(chrcols, chrColsfromDD$VarNames) ## character(0)
mismatchChrs_2 <- setdiff(chrColsfromDD$VarNames, chrcols) ## character(0)

## character variables for which the DD lists valid responses
tmp <- dfDD %>%
  dplyr::filter(
    grepl("CHAR|VARCHAR", `Data Type`, ignore.case = TRUE),
    !is.na(`Valid Responses`)
  ) %>%
  dplyr::select(VarNames, `Valid Responses`)

## compare the unique values of those columns in the dataset with the DD
DT::datatable(check_valid_responses(tmp, df))
## ignore EXAMINER


Handling Numeric Variables

## numeric columns found in the sub-dataset
numcols <- names(df)[vapply(df, is.numeric, logical(1))] ## 19 vars

## entries declared as NUMBER in the DD, with their valid responses
numColsfromDD <- dfDD %>%
  dplyr::filter(grepl("number", `Data Type`, ignore.case = TRUE)) %>%
  dplyr::select(VarNames, `Valid Responses`)

## data type consistency between data and DD:
## mismatchNums_1: numeric in the data but not declared as NUMBER in the DD
## mismatchNums_2: declared as NUMBER in the DD but not numeric in the data
mismatchNums_1 <- setdiff(numcols, numColsfromDD$VarNames) ## character(0)
mismatchNums_2 <- setdiff(numColsfromDD$VarNames, numcols) ## character(0)

unique(numColsfromDD$`Valid Responses`)
## [1] NA                             "1 thru 99999;"               
## [3] "1 thru 9999;"                 "0;\r\n0.5;\r\n1;\r\n2;\r\n3;"
## numeric variables for which the DD lists valid responses
tmp <- numColsfromDD %>%
  dplyr::filter(!is.na(`Valid Responses`))

DT::datatable(check_valid_numeric_responses(tmp, df))
## ignore GP


Save Cleaned Data

B4_CDR_RC <- df



B5_NPIQ_RC

df <- B5_NPIQ_RC

info(B5_NPIQ_RC,"SYSIND")
## #obs:305, cols:38, inds:304
## extract all the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
## 
## [[2]]
## [1] "character"
## 
## [[3]]
## [1] "logical"
## 
## [[4]]
## [1] "POSIXct" "POSIXt"
str(df, max.level = 99, list.len = 99999)
Click for details
## 'data.frame':    305 obs. of  38 variables:
##  $ SYSXM        : num  8275943 8258983 8277543 8260623 8261293 ...
##  $ SYSIND       : num  11160523 11369813 11620763 11163453 11638403 ...
##  $ SYSGP        : num  7923793 7952013 8005723 7924953 8006953 ...
##  $ SYSGPSTUDY   : num  1361903 1397123 1452433 1363063 1453663 ...
##  $ SYSINDGP     : num  7923633 8139083 8389833 7926663 8407473 ...
##  $ CGI_ORDER    : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ GPS_ORDER    : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ STDCGI_ORDER : num  11 11 11 11 11 11 11 11 11 11 ...
##  $ LSTUDY       : chr  "ADCRLPRADI" "ADCRLPRADI" "ADCONTROL" "ADFAMPRADI" ...
##  $ DB_OWNER     : chr  "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" ...
##  $ STUDY        : chr  "ALZ" "ALZ" "ALZ" "ALZ" ...
##  $ SUBSTUDY     : chr  "ADCRLPRADI" "ADCRLPRADI" "ADCONTROL" "ADFAMPRADI" ...
##  $ CENTER       : chr  "IHG" "IHG" "IHG" "IHG" ...
##  $ GP           : num  87883 88301 104457 87923 104556 ...
##  $ IND          : num  1 1 1 9000 1 1 1 1 1 101 ...
##  $ REFCTR       : logi  NA NA NA NA NA NA ...
##  $ EXAM_DATE    : POSIXct, format: "2024-02-14" "2024-02-13" ...
##  $ EXAMINER     : chr  "gsv32" "jjs2031" "sjt82" "gsv32" ...
##  $ DATE_OF_BIRTH: POSIXct, format: "1939-03-20" "1947-05-13" ...
##  $ AGE_AT_EXAM  : num  84 76 76 56 79 71 74 64 86 70 ...
##  $ REVIEW_DATE  : logi  NA NA NA NA NA NA ...
##  $ REVIEWER     : logi  NA NA NA NA NA NA ...
##  $ NPIQINF      : num  1 2 3 3 2 2 3 1 3 3 ...
##  $ NPIQINF_OTH  : chr  NA NA "center caretaker" "cousin" ...
##  $ NPIQTYPE     : num  1 NA NA 1 NA NA 1 NA NA 1 ...
##  $ DELSEV       : num  0 0 2 0 0 0 0 0 0 0 ...
##  $ HALLSEV      : num  1 0 2 0 0 0 0 0 0 0 ...
##  $ AGITSEV      : num  0 0 0 0 0 2 0 0 1 0 ...
##  $ DEPDSEV      : num  1 0 0 1 1 2 0 0 0 0 ...
##  $ ANXSEV       : num  1 0 0 0 2 2 0 0 1 0 ...
##  $ ELATSEV      : num  0 0 0 0 0 2 0 0 1 0 ...
##  $ APASEV       : num  0 0 0 0 0 3 0 0 0 0 ...
##  $ DISNSEV      : num  0 0 0 0 0 2 0 0 0 0 ...
##  $ IRRSEV       : num  0 0 0 1 0 2 0 0 0 0 ...
##  $ MOTSEV       : num  0 0 0 0 0 2 0 0 0 0 ...
##  $ NITESEV      : num  1 0 0 0 0 1 0 0 0 0 ...
##  $ APPSEV       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ NOTES_NPIQ   : chr  NA NA NA NA ...


Pull the regenerated DD

dfDD <- read_excel(revisedDDpath, sheet = "B5_NPIQ_RC")


Handling Logical Variables

## extract all logical variables
logicols <- colnames(df)[sapply(df, is.logical)]

## view those variables in the regenerated DD
dfDD[dfDD$VarNames %in% logicols,c("VarNames","Data Type")]
## # A tibble: 3 × 2
##   VarNames    `Data Type`
##   <chr>       <chr>      
## 1 REFCTR      VARCHAR2(6)
## 2 REVIEW_DATE date       
## 3 REVIEWER    CHAR
## All-NA columns are read in as logical; pick their target type from the DD.
## Type names are matched case-insensitively so that e.g. "DATE"/"date" and
## "CHAR"/"varchar2" are all recognised.

## select the vars to be converted to character
convert2chr <- dfDD$VarNames[dfDD$VarNames %in% logicols & grepl("CHAR", dfDD$`Data Type`, ignore.case = TRUE)]

## select the vars to be converted to date
convert2date <- dfDD$VarNames[dfDD$VarNames %in% logicols & grepl("date", dfDD$`Data Type`, ignore.case = TRUE)]

## convert
df[convert2chr] <- lapply(df[convert2chr], as.character)
df[convert2date] <- lapply(df[convert2date], as.Date)

## recheck the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
## 
## [[2]]
## [1] "character"
## 
## [[3]]
## [1] "POSIXct" "POSIXt" 
## 
## [[4]]
## [1] "Date"


Handling Date Variables

## extract date variables from sub-dataset
datecols <- colnames(df)[sapply(df, function(x) inherits(x, c("POSIXct", "POSIXt")))]
## [1] "EXAM_DATE"     "DATE_OF_BIRTH"

## extract date variables from regenerated DD
## (the DATE type is matched regardless of letter case)
datecolsFromDD <- dfDD$VarNames[toupper(dfDD$`Data Type`) %in% "DATE"]

## compare the two to see if we are missing any date variables
setdiff(datecols, datecolsFromDD) ## character(0)
## character(0)
setdiff(datecolsFromDD,datecols) ## REVIEW_DATE, ignore it, it has been corrected in previous step
## [1] "REVIEW_DATE"
head(df[,datecols])
##    EXAM_DATE DATE_OF_BIRTH
## 1 2024-02-14    1939-03-20
## 2 2024-02-13    1947-05-13
## 3 2023-04-17    1946-12-19
## 4 2023-10-25    1967-06-15
## 5 2023-09-12    1944-04-17
## 6 2023-09-12    1952-04-25
## convert format
df[datecols] <- lapply(df[datecols], as.Date)

## recheck the unique data types
unique(sapply(df, class))
## [1] "numeric"   "character" "Date"


Handling Character Variables

## character columns found in the sub-dataset
chrcols <- names(df)[vapply(df, is.character, logical(1))] ## 10 vars

## entries declared as CHAR / VARCHAR* in the DD
chrColsfromDD <- dfDD %>%
  dplyr::filter(grepl("^(varchar|char)", `Data Type`, ignore.case = TRUE)) %>%
  dplyr::select(VarNames, `Data Type`)

## data type consistency between data and DD:
## mismatchChrs_1: character in the data but not declared as character in the DD
## mismatchChrs_2: declared as character in the DD but not character in the data
mismatchChrs_1 <- setdiff(chrcols, chrColsfromDD$VarNames) ## character(0)
mismatchChrs_2 <- setdiff(chrColsfromDD$VarNames, chrcols) ## character(0)

## character variables for which the DD lists valid responses
tmp <- dfDD %>%
  dplyr::filter(
    grepl("CHAR|VARCHAR", `Data Type`, ignore.case = TRUE),
    !is.na(`Valid Responses`)
  ) %>%
  dplyr::select(VarNames, `Valid Responses`)

## compare the unique values of those columns in the dataset with the DD
DT::datatable(check_valid_responses(tmp, df))
## ignore EXAMINER


Handling Numeric Variables

## numeric columns found in the sub-dataset
numcols <- names(df)[vapply(df, is.numeric, logical(1))] ## 25 vars

## entries declared as NUMBER in the DD, with their valid responses
numColsfromDD <- dfDD %>%
  dplyr::filter(grepl("number", `Data Type`, ignore.case = TRUE)) %>%
  dplyr::select(VarNames, `Valid Responses`)

## data type consistency between data and DD:
## mismatchNums_1: numeric in the data but not declared as NUMBER in the DD
## mismatchNums_2: declared as NUMBER in the DD but not numeric in the data
mismatchNums_1 <- setdiff(numcols, numColsfromDD$VarNames) ## character(0)
mismatchNums_2 <- setdiff(numColsfromDD$VarNames, numcols) ## character(0)

unique(numColsfromDD$`Valid Responses`)
## [1] NA                                      
## [2] "1 thru 99999;"                         
## [3] "1 thru 9999;"                          
## [4] "1;\r\n2;\r\n3;\r\n4;\r\n5;\r\n6;\r\n7;"
## [5] "1;\r\n2;"                              
## [6] "1;\r\n2;\r\n3;"
## numeric variables for which the DD lists valid responses
tmp <- numColsfromDD %>%
  dplyr::filter(!is.na(`Valid Responses`))

DT::datatable(check_valid_numeric_responses(tmp, df))
## ignore GP


Save Cleaned Data

B5_NPIQ_RC <- df



B6_GDS_RC

df <- B6_GDS_RC

info(B6_GDS_RC,"SYSIND")
## #obs:543, cols:39, inds:539
## extract all the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
## 
## [[2]]
## [1] "character"
## 
## [[3]]
## [1] "logical"
## 
## [[4]]
## [1] "POSIXct" "POSIXt"
str(df, max.level = 99, list.len = 99999)
Click for details
## 'data.frame':    543 obs. of  39 variables:
##  $ SYSXM         : num  8276623 8258953 8259103 8277723 8277863 ...
##  $ SYSIND        : num  11369703 11369813 11037673 11435853 11638763 ...
##  $ SYSGP         : num  7951913 7952013 7894423 7962813 8007323 ...
##  $ SYSGPSTUDY    : num  1397023 1397123 1309743 1407923 1454033 ...
##  $ SYSINDGP      : num  8138973 8139083 7793413 8205123 8407833 ...
##  $ CGI_ORDER     : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ GPS_ORDER     : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ STDCGI_ORDER  : num  11 11 11 11 11 11 11 11 11 11 ...
##  $ LSTUDY        : chr  "ADCRLPRADI" "ADCRLPRADI" "ADFAMPRADI" "ADCRLPRADI" ...
##  $ DB_OWNER      : chr  "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" ...
##  $ STUDY         : chr  "ALZ" "ALZ" "ALZ" "ALZ" ...
##  $ SUBSTUDY      : chr  "ADCRLPRADI" "ADCRLPRADI" "ADFAMPRADI" "ADCRLPRADI" ...
##  $ CENTER        : chr  "IHG" "IHG" "IHG" "IHG" ...
##  $ GP            : num  88299 88301 87650 88452 104540 ...
##  $ IND           : num  1 1 9000 1 1 1 1 106 9000 1 ...
##  $ REFCTR        : logi  NA NA NA NA NA NA ...
##  $ EXAM_DATE     : POSIXct, format: "2024-02-13" "2024-02-13" ...
##  $ EXAMINER      : chr  "gsv32" "jjs2031" "gsv32" "gsv32" ...
##  $ DATE_OF_BIRTH : POSIXct, format: "1944-09-22" "1947-05-13" ...
##  $ AGE_AT_EXAM   : num  79 76 68 81 86 73 86 66 56 73 ...
##  $ REVIEW_DATE   : logi  NA NA NA NA NA NA ...
##  $ REVIEWER      : logi  NA NA NA NA NA NA ...
##  $ LIFE          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ ACTIVITY      : num  0 0 0 0 1 0 0 0 0 0 ...
##  $ EMPTY         : num  0 0 0 1 0 0 0 0 0 0 ...
##  $ BORED         : num  0 0 0 0 1 0 0 0 0 0 ...
##  $ SPIRIT        : num  0 0 0 0 1 0 0 0 0 0 ...
##  $ AFRAID        : num  0 0 1 0 0 0 0 0 0 1 ...
##  $ HAPPY         : num  0 0 0 0 1 0 0 1 0 0 ...
##  $ HELPLESS      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ STAY_HOME     : num  0 0 0 1 0 0 0 0 0 0 ...
##  $ MEMORY        : num  0 0 0 0 1 0 0 0 0 0 ...
##  $ ALIVE         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ WORTHLESS     : num  0 0 0 0 1 0 0 0 0 0 ...
##  $ ENERGY        : num  0 0 0 0 1 0 0 0 0 0 ...
##  $ HOPELESS      : num  0 0 0 0 1 0 0 0 0 0 ...
##  $ BETTER_OFF    : num  0 0 0 1 0 0 0 0 0 0 ...
##  $ INCOMPLETE_GDS: num  NA NA NA NA NA NA NA NA NA NA ...
##  $ COMMENTS_GDS  : chr  NA NA NA NA ...


Pull the regenerated DD

dfDD <- read_excel(revisedDDpath, sheet = "B6_GDS_RC")


Handling Logical Variables

## extract all logical variables
logicols <- colnames(df)[sapply(df, is.logical)]

## view those variables in the regenerated DD
dfDD[dfDD$VarNames %in% logicols,c("VarNames","Data Type")]
## # A tibble: 3 × 2
##   VarNames    `Data Type`
##   <chr>       <chr>      
## 1 REFCTR      VARCHAR2(6)
## 2 REVIEW_DATE date       
## 3 REVIEWER    CHAR
## All-NA columns are read in as logical; pick their target type from the DD
## (type names matched case-insensitively).

## select the vars to be converted to character
convert2chr <- dfDD$VarNames[dfDD$VarNames %in% logicols & grepl("CHAR", dfDD$`Data Type`, ignore.case = TRUE)]

## select the vars to be converted to date
convert2date <- dfDD$VarNames[dfDD$VarNames %in% logicols & grepl("DATE", dfDD$`Data Type`, ignore.case = TRUE)]

## convert
df[convert2chr] <- lapply(df[convert2chr], as.character)
df[convert2date] <- lapply(df[convert2date], as.Date)

## recheck the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
## 
## [[2]]
## [1] "character"
## 
## [[3]]
## [1] "POSIXct" "POSIXt" 
## 
## [[4]]
## [1] "Date"


Handling Date Variables

## extract date variables from sub-dataset
datecols <- colnames(df)[sapply(df, function(x) inherits(x, c("POSIXct", "POSIXt")))]
## [1] "EXAM_DATE"     "DATE_OF_BIRTH"

## extract date variables from regenerated DD
## (the DATE type is matched regardless of letter case)
datecolsFromDD <- dfDD$VarNames[toupper(dfDD$`Data Type`) %in% "DATE"]

## compare the two to see if we are missing any date variables
setdiff(datecols, datecolsFromDD) ## character(0)
## character(0)
setdiff(datecolsFromDD,datecols)
## [1] "REVIEW_DATE"
# [1] "REVIEW_DATE": ignore it, this variable has been corrected in the previous step

head(df[,datecols])
##    EXAM_DATE DATE_OF_BIRTH
## 1 2024-02-13    1944-09-22
## 2 2024-02-13    1947-05-13
## 3 2023-10-24    1954-10-29
## 4 2024-02-15    1942-09-30
## 5 2023-09-13    1937-08-13
## 6 2023-05-15    1950-04-02
## convert format
df[datecols] <- lapply(df[datecols], as.Date)

## recheck the unique data types
unique(sapply(df, class))
## [1] "numeric"   "character" "Date"


Handling Character Variables

## character columns found in the sub-dataset
chrcols <- names(df)[vapply(df, is.character, logical(1))] ## 9 vars

## entries declared as CHAR / VARCHAR* in the DD
chrColsfromDD <- dfDD %>%
  dplyr::filter(grepl("^(varchar|char)", `Data Type`, ignore.case = TRUE)) %>%
  dplyr::select(VarNames, `Data Type`)

## data type consistency between data and DD:
## mismatchChrs_1: character in the data but not declared as character in the DD
## mismatchChrs_2: declared as character in the DD but not character in the data
mismatchChrs_1 <- setdiff(chrcols, chrColsfromDD$VarNames) ## character(0)
mismatchChrs_2 <- setdiff(chrColsfromDD$VarNames, chrcols) ## character(0)

## character variables for which the DD lists valid responses
tmp <- dfDD %>%
  dplyr::filter(
    grepl("CHAR|VARCHAR", `Data Type`, ignore.case = TRUE),
    !is.na(`Valid Responses`)
  ) %>%
  dplyr::select(VarNames, `Valid Responses`)

## compare the unique values of those columns in the dataset with the DD
DT::datatable(check_valid_responses(tmp, df))
## ignore EXAMINER


Handling Numeric Variables

## numeric columns found in the sub-dataset
numcols <- names(df)[vapply(df, is.numeric, logical(1))] ## 27 vars

## entries declared as NUMBER in the DD, with their valid responses
numColsfromDD <- dfDD %>%
  dplyr::filter(grepl("number", `Data Type`, ignore.case = TRUE)) %>%
  dplyr::select(VarNames, `Valid Responses`)

## data type consistency between data and DD:
## mismatchNums_1: numeric in the data but not declared as NUMBER in the DD
## mismatchNums_2: declared as NUMBER in the DD but not numeric in the data
mismatchNums_1 <- setdiff(numcols, numColsfromDD$VarNames) ## character(0)
mismatchNums_2 <- setdiff(numColsfromDD$VarNames, numcols) ## character(0)

unique(numColsfromDD$`Valid Responses`)
## [1] NA              "1 thru 99999;" "1 thru 9999;"
## numeric variables for which the DD lists valid responses
tmp <- numColsfromDD %>%
  dplyr::filter(!is.na(`Valid Responses`))

DT::datatable(check_valid_numeric_responses(tmp, df))
## ignore GP


Save Cleaned Data

B6_GDS_RC <- df



B7_FAS_RC

df <- B7_FAS_RC

info(B7_FAS_RC,"SYSIND")
## #obs:435, cols:33, inds:431
## extract all the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
## 
## [[2]]
## [1] "character"
## 
## [[3]]
## [1] "logical"
## 
## [[4]]
## [1] "POSIXct" "POSIXt"
str(df, max.level = 99, list.len = 99999)
Click for details
## 'data.frame':    435 obs. of  33 variables:
##  $ SYSXM        : num  8275913 8275953 8258973 8259133 8277373 ...
##  $ SYSIND       : num  11620433 11160523 11369813 11037673 11620763 ...
##  $ SYSGP        : num  8005513 7923793 7952013 7894423 8005723 ...
##  $ SYSGPSTUDY   : num  1452223 1361903 1397123 1309743 1452433 ...
##  $ SYSINDGP     : num  8389503 7923633 8139083 7793413 8389833 ...
##  $ CGI_ORDER    : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ GPS_ORDER    : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ STDCGI_ORDER : num  11 11 11 11 11 11 11 11 11 11 ...
##  $ LSTUDY       : chr  "ADCONTROL" "ADCRLPRADI" "ADCRLPRADI" "ADFAMPRADI" ...
##  $ DB_OWNER     : chr  "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" ...
##  $ STUDY        : chr  "ALZ" "ALZ" "ALZ" "ALZ" ...
##  $ SUBSTUDY     : chr  "ADCONTROL" "ADCRLPRADI" "ADCRLPRADI" "ADFAMPRADI" ...
##  $ CENTER       : chr  "IHG" "IHG" "IHG" "IHG" ...
##  $ GP           : num  104507 87883 88301 87650 104457 ...
##  $ IND          : num  1 1 1 9000 1 1 1 1 106 9000 ...
##  $ REFCTR       : logi  NA NA NA NA NA NA ...
##  $ EXAM_DATE    : POSIXct, format: "2023-08-09" "2024-02-14" ...
##  $ EXAMINER     : chr  "jjs2031" "gsv32" "jjs2031" "gsv32" ...
##  $ DATE_OF_BIRTH: POSIXct, format: "1944-06-21" "1939-03-20" ...
##  $ AGE_AT_EXAM  : num  79 84 76 68 76 81 73 86 66 56 ...
##  $ REVIEW_DATE  : logi  NA NA NA NA NA NA ...
##  $ REVIEWER     : logi  NA NA NA NA NA NA ...
##  $ FAQ1         : num  0 8 0 8 0 0 0 0 0 0 ...
##  $ FAQ2         : num  0 0 0 8 1 0 0 0 0 0 ...
##  $ FAQ3         : num  0 0 0 8 1 0 0 0 0 0 ...
##  $ FAQ4         : num  0 0 0 8 1 0 0 0 0 0 ...
##  $ FAQ5         : num  0 0 0 8 1 0 0 0 1 0 ...
##  $ FAQ6         : num  0 0 0 8 1 0 0 0 1 0 ...
##  $ FAQ7         : num  0 0 0 8 1 0 0 0 0 0 ...
##  $ FAQ8         : num  0 0 0 8 1 0 0 0 0 0 ...
##  $ FAQ9         : num  0 0 0 8 1 0 0 0 0 0 ...
##  $ FAQ10        : num  0 2 0 8 1 0 0 0 1 0 ...
##  $ NOTES_B7FAS  : chr  NA NA NA NA ...


Pull the regenerated DD

dfDD <- read_excel(revisedDDpath, sheet = "B7_FAS_RC")


Handling Logical Variables

## extract all logical variables
logicols <- colnames(df)[sapply(df, is.logical)]

## view those variables in the regenerated DD
dfDD[dfDD$VarNames %in% logicols,c("VarNames","Data Type")]
## # A tibble: 3 × 2
##   VarNames    `Data Type`
##   <chr>       <chr>      
## 1 REFCTR      VARCHAR2(6)
## 2 REVIEW_DATE date       
## 3 REVIEWER    CHAR
## All-NA columns are read in as logical; pick their target type from the DD
## (type names matched case-insensitively).

## select the vars to be converted to character
convert2chr <- dfDD$VarNames[dfDD$VarNames %in% logicols & grepl("CHAR", dfDD$`Data Type`, ignore.case = TRUE)]

## select the vars to be converted to date
convert2date <- dfDD$VarNames[dfDD$VarNames %in% logicols & grepl("date", dfDD$`Data Type`, ignore.case = TRUE)]

## convert
df[convert2chr] <- lapply(df[convert2chr], as.character)
df[convert2date] <- lapply(df[convert2date], as.Date)

## recheck the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
## 
## [[2]]
## [1] "character"
## 
## [[3]]
## [1] "POSIXct" "POSIXt" 
## 
## [[4]]
## [1] "Date"


Handling Date Variables

## extract date variables from sub-dataset
datecols <- colnames(df)[sapply(df, function(x) inherits(x, c("POSIXct", "POSIXt")))]
## [1] "EXAM_DATE"     "DATE_OF_BIRTH"

## extract date variables from regenerated DD
## (the DATE type is matched regardless of letter case)
datecolsFromDD <- dfDD$VarNames[toupper(dfDD$`Data Type`) %in% "DATE"]

## compare the two to see if we are missing any date variables
setdiff(datecols, datecolsFromDD) ## character(0)
## character(0)
setdiff(datecolsFromDD,datecols) ## REVIEW_DATE, ignore it, it has been corrected in previous step
## [1] "REVIEW_DATE"
head(df[,datecols])
##    EXAM_DATE DATE_OF_BIRTH
## 1 2023-08-09    1944-06-21
## 2 2024-02-14    1939-03-20
## 3 2024-02-13    1947-05-13
## 4 2023-10-24    1954-10-29
## 5 2023-04-17    1946-12-19
## 6 2024-02-15    1942-09-30
## convert format
df[datecols] <- lapply(df[datecols], as.Date)

## recheck the unique data types
unique(sapply(df, class))
## [1] "numeric"   "character" "Date"


Handling Character Variables

## character columns found in the sub-dataset
chrcols <- names(df)[vapply(df, is.character, logical(1))] ## 9 vars

## entries declared as CHAR / VARCHAR* in the DD
chrColsfromDD <- dfDD %>%
  dplyr::filter(grepl("^(varchar|char)", `Data Type`, ignore.case = TRUE)) %>%
  dplyr::select(VarNames, `Data Type`)

## data type consistency between data and DD:
## mismatchChrs_1: character in the data but not declared as character in the DD
## mismatchChrs_2: declared as character in the DD but not character in the data
mismatchChrs_1 <- setdiff(chrcols, chrColsfromDD$VarNames) ## character(0)
mismatchChrs_2 <- setdiff(chrColsfromDD$VarNames, chrcols) ## character(0)

## character variables for which the DD lists valid responses
tmp <- dfDD %>%
  dplyr::filter(
    grepl("CHAR|VARCHAR", `Data Type`, ignore.case = TRUE),
    !is.na(`Valid Responses`)
  ) %>%
  dplyr::select(VarNames, `Valid Responses`)

## compare the unique values of those columns in the dataset with the DD
DT::datatable(check_valid_responses(tmp, df))
## ignore EXAMINER


Handling Numeric Variables

## numeric columns found in the sub-dataset
numcols <- names(df)[vapply(df, is.numeric, logical(1))] ## 21 vars

## entries declared as NUMBER in the DD, with their valid responses
numColsfromDD <- dfDD %>%
  dplyr::filter(grepl("number", `Data Type`, ignore.case = TRUE)) %>%
  dplyr::select(VarNames, `Valid Responses`)

## data type consistency between data and DD:
## mismatchNums_1: numeric in the data but not declared as NUMBER in the DD
## mismatchNums_2: declared as NUMBER in the DD but not numeric in the data
mismatchNums_1 <- setdiff(numcols, numColsfromDD$VarNames) ## character(0)
mismatchNums_2 <- setdiff(numColsfromDD$VarNames, numcols) ## character(0)

unique(numColsfromDD$`Valid Responses`)
## [1] NA                           "1 thru 99999;"             
## [3] "1 thru 9999;"               "0;\r\n1;\r\n2;\r\n3;\r\n8;"
## numeric variables for which the DD lists valid responses
tmp <- numColsfromDD %>%
  dplyr::filter(!is.na(`Valid Responses`))

DT::datatable(check_valid_numeric_responses(tmp, df))
## ignore GP


Save Cleaned Data

B7_FAS_RC<- df



BCF_RECOG_RC

df <- BCF_RECOG_RC

info(BCF_RECOG_RC,"SYSIND")
## #obs:266, cols:24, inds:266
## extract all the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
## 
## [[2]]
## [1] "character"
## 
## [[3]]
## [1] "logical"
## 
## [[4]]
## [1] "POSIXct" "POSIXt"
str(df, max.level = 99, list.len = 99999)
Click for details
## 'data.frame':    266 obs. of  24 variables:
##  $ SYSXM                 : num  8275963 8260183 8260813 8262253 8262463 ...
##  $ SYSIND                : num  11620763 11620563 11621203 11638453 11638463 ...
##  $ SYSGP                 : num  8005723 8005633 8006163 8007003 8007013 ...
##  $ SYSGPSTUDY            : num  1452433 1452343 1452873 1453713 1453723 ...
##  $ SYSINDGP              : num  8389833 8389633 8390273 8407523 8407533 ...
##  $ CGI_ORDER             : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ GPS_ORDER             : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ STDCGI_ORDER          : num  11 11 11 11 11 11 11 11 11 11 ...
##  $ LSTUDY                : chr  "ADCONTROL" "ADCONTROL" "ADCONTROL" "ADCONTROL" ...
##  $ DB_OWNER              : chr  "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" ...
##  $ STUDY                 : chr  "ALZ" "ALZ" "ALZ" "ALZ" ...
##  $ SUBSTUDY              : chr  "ADCONTROL" "ADCONTROL" "ADCONTROL" "ADCONTROL" ...
##  $ CENTER                : chr  "IHG" "IHG" "IHG" "IHG" ...
##  $ GP                    : num  104457 104477 104455 104549 104548 ...
##  $ IND                   : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ REFCTR                : logi  NA NA NA NA NA NA ...
##  $ EXAM_DATE             : POSIXct, format: "2023-04-17" "2023-05-15" ...
##  $ EXAMINER              : chr  "sjt82" "jjs2031" "jjs2031" "jjs2031" ...
##  $ DATE_OF_BIRTH         : POSIXct, format: "1946-12-19" "1949-12-01" ...
##  $ AGE_AT_EXAM           : num  76 73 81 74 80 74 73 70 81 91 ...
##  $ REVIEW_DATE           : logi  NA NA NA NA NA NA ...
##  $ REVIEWER              : logi  NA NA NA NA NA NA ...
##  $ CBF_RECOGNIZE_STIMULUS: num  0 0 1 1 1 1 0 1 1 1 ...
##  $ COMMENTS_BCFRECOGN    : chr  NA NA NA NA ...


Pull the regenerated DD

dfDD <- read_excel(revisedDDpath, sheet = "BCF_RECOG_RC")


Handling Logical Variables

## extract all logical variables
logicols <- colnames(df)[sapply(df, is.logical)]

## view those variables in the regenerated DD
dfDD[dfDD$VarNames %in% logicols,c("VarNames","Data Type")]
## # A tibble: 3 × 2
##   VarNames    `Data Type`
##   <chr>       <chr>      
## 1 REFCTR      VARCHAR2(6)
## 2 REVIEW_DATE date       
## 3 REVIEWER    CHAR
## Decide the target type of every all-NA (logical) column from the DD.
## `convert2num` is recomputed here for THIS sub-dataset: previously it was
## not defined in this section, so setdiff() below silently reused whatever
## value an earlier sub-dataset had left in the global environment.

## select the vars to be converted to numeric
convert2num <- dfDD$VarNames[dfDD$VarNames %in% logicols & grepl("number", dfDD$`Data Type`, ignore.case = TRUE)]

## select the vars to be converted to date
convert2date <- dfDD$VarNames[dfDD$VarNames %in% logicols & grepl("date", dfDD$`Data Type`, ignore.case = TRUE)] ## REVIEW_DATE

## the rest should be converted to character
convert2chr <- setdiff(logicols, c(convert2num, convert2date)) ## [1] "REFCTR"   "REVIEWER"

## convert (each assignment is a no-op when its selection is empty)
df[convert2num] <- lapply(df[convert2num], as.numeric)
df[convert2date] <- lapply(df[convert2date], as.Date)
df[convert2chr] <- lapply(df[convert2chr], as.character)

## recheck the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
## 
## [[2]]
## [1] "character"
## 
## [[3]]
## [1] "POSIXct" "POSIXt" 
## 
## [[4]]
## [1] "Date"


Handling Date Variables

## date-time columns still stored as POSIXct/POSIXt in the sub-dataset
datecols <- names(df)[vapply(df, inherits, logical(1), what = c("POSIXct", "POSIXt"))]
## [1] "EXAM_DATE"     "DATE_OF_BIRTH"

## variables declared as dates in the regenerated DD
datecolsFromDD <- dfDD[["VarNames"]][dfDD[["Data Type"]] %in% c("DATE", "date")]

## date columns found in the data but not declared as dates in the DD
setdiff(datecols, datecolsFromDD) ## character(0)
## character(0)
setdiff(datecolsFromDD,datecols) ## [1] "REVIEW_DATE" can ignore REVIEW_DATE, as it has been corrected in previous step
## [1] "REVIEW_DATE"
head(df[,datecols])
##    EXAM_DATE DATE_OF_BIRTH
## 1 2023-04-17    1946-12-19
## 2 2023-05-15    1949-12-01
## 3 2023-02-24    1941-10-04
## 4 2023-09-11    1949-05-19
## 5 2023-09-11    1942-10-17
## 6 2023-02-23    1948-11-25
## convert format
df[datecols] <- lapply(df[datecols], as.Date)

## recheck the unique data types
unique(sapply(df, class))
## [1] "numeric"   "character" "Date"


Handling Character Variables

## character columns present in the sub-dataset
chrcols <- names(df)[vapply(df, is.character, logical(1))] ## 9 vars

## DD rows whose declared type starts with VARCHAR / CHAR
chrColsfromDD <- dfDD[
  grepl("^(varchar|char)", dfDD[["Data Type"]], ignore.case = TRUE),
  c("VarNames", "Data Type")
]

## type-consistency check between data and DD:
## mismatchChrs_1: character in the data, another type in the DD
## mismatchChrs_2: character in the DD, another type in the data
mismatchChrs_1 <- setdiff(chrcols, chrColsfromDD[["VarNames"]]) ## character(0)
mismatchChrs_2 <- setdiff(chrColsfromDD[["VarNames"]], chrcols) ## character(0)

## character variables for which the DD lists valid responses
tmp <- dfDD[
  grepl("CHAR|VARCHAR", dfDD[["Data Type"]], ignore.case = TRUE) &
    !is.na(dfDD[["Valid Responses"]]),
  c("VarNames", "Valid Responses")
]

## compare the observed unique values of those columns with the DD
DT::datatable(check_valid_responses(tmp, df))
## ignore EXAMINER


Handling Numeric Variables

## numeric columns present in the sub-dataset
numcols <- names(df)[vapply(df, is.numeric, logical(1))] ## 12 vars

## DD rows whose declared type contains NUMBER
numColsfromDD <- dfDD[
  grepl("number", dfDD[["Data Type"]], ignore.case = TRUE),
  c("VarNames", "Valid Responses")
]

## type-consistency check between data and DD:
## mismatchNums_1: numeric in the data, another type in the DD
## mismatchNums_2: numeric in the DD, another type in the data
mismatchNums_1 <- setdiff(numcols, numColsfromDD[["VarNames"]]) ## character(0)
mismatchNums_2 <- setdiff(numColsfromDD[["VarNames"]], numcols) ## character(0)

## distinct valid-response specifications used by numeric variables
unique(numColsfromDD[["Valid Responses"]])
## [1] NA              "1 thru 99999;" "1 thru 9999;"  "0;\r\n1;"
tmp <- dfDD[grepl("number", dfDD$`Data Type`, ignore.case = TRUE) & !is.na(dfDD$`Valid Responses`),c("VarNames","Valid Responses")]

DT::datatable(check_valid_numeric_responses(tmp,df))
## ignore GP


Save Cleaned Data

BCF_RECOG_RC <- df



BCFCD_RC

## work on `df`; the cleaned result is assigned back to BCFCD_RC at the end
## of this section
df <- BCFCD_RC

## `info()` comes from the sourced helper script; judging by its printed
## output it reports observation, column and unique-SYSIND counts
## NOTE(review): helper not visible here -- confirm against helperScripts_PRADI.R
info(BCFCD_RC,"SYSIND")
## #obs:269, cols:38, inds:269
## extract all the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
## 
## [[2]]
## [1] "character"
## 
## [[3]]
## [1] "logical"
## 
## [[4]]
## [1] "POSIXct" "POSIXt"
str(df, max.level = 99, list.len = 99999)
Click for details
## 'data.frame':    269 obs. of  38 variables:
##  $ SYSXM                        : num  8275933 8260173 8260803 8262243 8262453 ...
##  $ SYSIND                       : num  11620763 11620563 11621203 11638453 11638463 ...
##  $ SYSGP                        : num  8005723 8005633 8006163 8007003 8007013 ...
##  $ SYSGPSTUDY                   : num  1452433 1452343 1452873 1453713 1453723 ...
##  $ SYSINDGP                     : num  8389833 8389633 8390273 8407523 8407533 ...
##  $ CGI_ORDER                    : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ GPS_ORDER                    : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ STDCGI_ORDER                 : num  11 11 11 11 11 11 11 11 11 11 ...
##  $ LSTUDY                       : chr  "ADCONTROL" "ADCONTROL" "ADCONTROL" "ADCONTROL" ...
##  $ DB_OWNER                     : chr  "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" ...
##  $ STUDY                        : chr  "ALZ" "ALZ" "ALZ" "ALZ" ...
##  $ SUBSTUDY                     : chr  "ADCONTROL" "ADCONTROL" "ADCONTROL" "ADCONTROL" ...
##  $ CENTER                       : chr  "IHG" "IHG" "IHG" "IHG" ...
##  $ GP                           : num  104457 104477 104455 104549 104548 ...
##  $ IND                          : num  1 1 1 1 1 1 1 105 1 1 ...
##  $ REFCTR                       : logi  NA NA NA NA NA NA ...
##  $ EXAM_DATE                    : POSIXct, format: "2023-04-17" "2023-05-15" ...
##  $ EXAMINER                     : chr  "sjt82" "jjs2031" "jjs2031" "jjs2031" ...
##  $ DATE_OF_BIRTH                : POSIXct, format: "1946-12-19" "1949-12-01" ...
##  $ AGE_AT_EXAM                  : num  76 73 81 74 80 74 73 63 70 81 ...
##  $ REVIEW_DATE                  : logi  NA NA NA NA NA NA ...
##  $ REVIEWER                     : logi  NA NA NA NA NA NA ...
##  $ FOURSIDED_DELAY              : num  0 2 1 2 1 2 0 2 2 2 ...
##  $ STRAIGHT_LINES_DELAY         : num  0 2 1 2 2 1 0 2 2 2 ...
##  $ MIDDLETHIRD_DELAY            : num  0 0 0 1 0 1 0 2 1 0 ...
##  $ ROUND_DELAY                  : num  0 0 1 2 2 1 0 2 2 0 ...
##  $ VERTICAL_LINES_DELAY         : num  0 0 1 1 1 1 0 2 1 0 ...
##  $ BELOW3_DELAY                 : num  0 2 1 1 1 1 0 2 1 1 ...
##  $ VERTEX_DELAY                 : num  0 0 0 1 0 0 0 2 1 0 ...
##  $ GAB87_DELAY                  : num  0 1 1 1 0 1 0 1 1 1 ...
##  $ BONUS_DELAY                  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ TIME_HOUR_DELAY              : chr  "01:00 PM" "10:23 AM" "01:19 PM" "12:21 PM" ...
##  $ COMMENT_BCFDELAY             : chr  "Drew a landscape" NA NA NA ...
##  $ FILE_NAME1                   : chr  NA NA NA NA ...
##  $ TOTAL_SCORE_BENSON_DELAY     : num  0 7 6 11 7 8 0 15 11 6 ...
##  $ TOTAL_SCORE_BENSON_DEL_STATUS: logi  NA NA NA NA NA NA ...
##  $ PLUS_BONUS_DELAY             : num  0 7 6 11 7 8 0 15 11 6 ...
##  $ PLUS_BONUS_DELAY_STATUS      : chr  NA NA NA NA ...


Pull the regenerated DD

dfDD <- read_excel(revisedDDpath, sheet = "BCFCD_RC")


Handling Logical Variables

## extract all logical variables
logicols <- colnames(df)[sapply(df, is.logical)]

## view those variables in the regenerated DD
dfDD[dfDD$VarNames %in% logicols,c("VarNames","Data Type")]
## # A tibble: 4 × 2
##   VarNames                      `Data Type`
##   <chr>                         <chr>      
## 1 REFCTR                        VARCHAR2(6)
## 2 REVIEW_DATE                   date       
## 3 REVIEWER                      CHAR       
## 4 TOTAL_SCORE_BENSON_DEL_STATUS CHAR
## Decide the target type of every all-NA (logical) column from the DD.
## `convert2num` is recomputed here for THIS sub-dataset: previously it was
## not defined in this section, so setdiff() below silently reused whatever
## value an earlier sub-dataset had left in the global environment.

## select the vars to be converted to numeric
convert2num <- dfDD$VarNames[dfDD$VarNames %in% logicols & grepl("number", dfDD$`Data Type`, ignore.case = TRUE)]

## select the vars to be converted to date
convert2date <- dfDD$VarNames[dfDD$VarNames %in% logicols & grepl("date", dfDD$`Data Type`, ignore.case = TRUE)] ## REVIEW_DATE

## the rest should be converted to character
convert2chr <- setdiff(logicols, c(convert2num, convert2date)) ## "REFCTR" "REVIEWER" "TOTAL_SCORE_BENSON_DEL_STATUS"

## convert (each assignment is a no-op when its selection is empty)
df[convert2num] <- lapply(df[convert2num], as.numeric)
df[convert2date] <- lapply(df[convert2date], as.Date)
df[convert2chr] <- lapply(df[convert2chr], as.character)

## recheck the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
## 
## [[2]]
## [1] "character"
## 
## [[3]]
## [1] "POSIXct" "POSIXt" 
## 
## [[4]]
## [1] "Date"


Handling Date Variables

## date-time columns still stored as POSIXct/POSIXt in the sub-dataset
datecols <- names(df)[vapply(df, inherits, logical(1), what = c("POSIXct", "POSIXt"))]
## [1] "EXAM_DATE"     "DATE_OF_BIRTH"

## variables declared as dates in the regenerated DD
datecolsFromDD <- dfDD[["VarNames"]][dfDD[["Data Type"]] %in% c("DATE", "date")]

## date columns found in the data but not declared as dates in the DD
setdiff(datecols, datecolsFromDD) ## character(0)
## character(0)
setdiff(datecolsFromDD,datecols) ## [1] "REVIEW_DATE" can ignore REVIEW_DATE, as it has been corrected in previous step
## [1] "REVIEW_DATE"
head(df[,datecols])
##    EXAM_DATE DATE_OF_BIRTH
## 1 2023-04-17    1946-12-19
## 2 2023-05-15    1949-12-01
## 3 2023-02-24    1941-10-04
## 4 2023-09-11    1949-05-19
## 5 2023-09-11    1942-10-17
## 6 2023-02-23    1948-11-25
## convert format
df[datecols] <- lapply(df[datecols], as.Date)

## recheck the unique data types
unique(sapply(df, class))
## [1] "numeric"   "character" "Date"


Handling Character Variables

## character columns present in the sub-dataset
chrcols <- names(df)[vapply(df, is.character, logical(1))] ## 13 vars

## DD rows whose declared type starts with VARCHAR / CHAR
chrColsfromDD <- dfDD[
  grepl("^(varchar|char)", dfDD[["Data Type"]], ignore.case = TRUE),
  c("VarNames", "Data Type")
]

## type-consistency check between data and DD:
## mismatchChrs_1: character in the data, another type in the DD
## mismatchChrs_2: character in the DD, another type in the data
mismatchChrs_1 <- setdiff(chrcols, chrColsfromDD[["VarNames"]]) ## character(0)
mismatchChrs_2 <- setdiff(chrColsfromDD[["VarNames"]], chrcols) ## character(0)

## character variables for which the DD lists valid responses
tmp <- dfDD[
  grepl("CHAR|VARCHAR", dfDD[["Data Type"]], ignore.case = TRUE) &
    !is.na(dfDD[["Valid Responses"]]),
  c("VarNames", "Valid Responses")
]

## compare the observed unique values of those columns with the DD
DT::datatable(check_valid_responses(tmp, df))
## ignore EXAMINER


Handling Numeric Variables

## numeric columns present in the sub-dataset
numcols <- names(df)[vapply(df, is.numeric, logical(1))] ## 22 vars

## DD rows whose declared type contains NUMBER
numColsfromDD <- dfDD[
  grepl("number", dfDD[["Data Type"]], ignore.case = TRUE),
  c("VarNames", "Valid Responses")
]

## type-consistency check between data and DD:
## mismatchNums_1: numeric in the data, another type in the DD
## mismatchNums_2: numeric in the DD, another type in the data
mismatchNums_1 <- setdiff(numcols, numColsfromDD[["VarNames"]]) ## character(0)
mismatchNums_2 <- setdiff(numColsfromDD[["VarNames"]], numcols) ## character(0)

## distinct valid-response specifications used by numeric variables
unique(numColsfromDD[["Valid Responses"]])
## [1] NA               "1 thru 99999;"  "1 thru 9999;"   "0;\r\n1;\r\n2;"
## [5] "0;\r\n1;"
tmp <- dfDD[grepl("number", dfDD$`Data Type`, ignore.case = TRUE) & !is.na(dfDD$`Valid Responses`),c("VarNames","Valid Responses")]

DT::datatable(check_valid_numeric_responses(tmp,df))
## ignore GP


Save Cleaned Data

BCFCD_RC <- df



BCFCI_RC

## work on `df`; the cleaned result is assigned back to BCFCI_RC at the end
## of this section
df <- BCFCI_RC

## `info()` comes from the sourced helper script; judging by its printed
## output it reports observation, column and unique-SYSIND counts
## NOTE(review): helper not visible here -- confirm against helperScripts_PRADI.R
info(BCFCI_RC,"SYSIND")
## #obs:270, cols:38, inds:270
## extract all the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
## 
## [[2]]
## [1] "character"
## 
## [[3]]
## [1] "logical"
## 
## [[4]]
## [1] "POSIXct" "POSIXt"
str(df, max.level = 99, list.len = 99999)
Click for details
## 'data.frame':    270 obs. of  38 variables:
##  $ SYSXM                        : num  8260073 8260643 8260693 8261453 8278753 ...
##  $ SYSIND                       : num  11620563 11621213 11621203 11621283 11617943 ...
##  $ SYSGP                        : num  8005633 8006173 8006163 8006243 8005103 ...
##  $ SYSGPSTUDY                   : num  1452343 1452883 1452873 1452953 1451813 ...
##  $ SYSINDGP                     : num  8389633 8390283 8390273 8390353 8387013 ...
##  $ CGI_ORDER                    : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ GPS_ORDER                    : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ STDCGI_ORDER                 : num  11 11 11 11 11 11 11 11 11 11 ...
##  $ LSTUDY                       : chr  "ADCONTROL" "ADCONTROL" "ADCONTROL" "ADCONTROL" ...
##  $ DB_OWNER                     : chr  "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" ...
##  $ STUDY                        : chr  "ALZ" "ALZ" "ALZ" "ALZ" ...
##  $ SUBSTUDY                     : chr  "ADCONTROL" "ADCONTROL" "ADCONTROL" "ADCONTROL" ...
##  $ CENTER                       : chr  "IHG" "IHG" "IHG" "IHG" ...
##  $ GP                           : num  104477 104456 104455 104471 104519 ...
##  $ IND                          : num  1 1 1 1 1 1 1 105 1 1 ...
##  $ REFCTR                       : logi  NA NA NA NA NA NA ...
##  $ EXAM_DATE                    : POSIXct, format: "2023-05-15" "2023-02-24" ...
##  $ EXAMINER                     : chr  "jjs2031" "jjs2031" "jjs2031" "gsv32" ...
##  $ DATE_OF_BIRTH                : POSIXct, format: "1949-12-01" "1949-06-10" ...
##  $ AGE_AT_EXAM                  : num  73 73 81 67 67 74 80 63 73 81 ...
##  $ REVIEW_DATE                  : logi  NA NA NA NA NA NA ...
##  $ REVIEWER                     : logi  NA NA NA NA NA NA ...
##  $ FOURSIDED                    : num  2 2 1 1 2 2 2 2 0 2 ...
##  $ STRAIGHT_LINES               : num  2 2 2 2 2 2 2 2 0 1 ...
##  $ MIDDLETHIRD                  : num  2 2 2 1 2 2 2 2 0 2 ...
##  $ ROUND                        : num  2 2 2 2 2 2 2 2 0 2 ...
##  $ VERTICAL_LINES               : num  2 2 2 1 2 2 2 2 0 2 ...
##  $ BELOW3                       : num  2 2 1 1 1 2 2 2 0 2 ...
##  $ VERTEX                       : num  2 2 1 1 2 2 2 2 0 2 ...
##  $ GAP87                        : num  2 2 1 1 2 2 2 2 0 2 ...
##  $ BONUS                        : num  1 1 0 0 0 1 1 1 0 0 ...
##  $ TIME_HOUR_COPY               : chr  "10:13 AM" "10:44 AM" "01:08 PM" "11:15 AM" ...
##  $ COMMENT_BCFCOPY              : chr  NA NA NA NA ...
##  $ FILE_NAME1                   : chr  NA NA NA NA ...
##  $ BCF_COPY_SCORE               : num  16 16 12 10 15 16 16 16 0 15 ...
##  $ BCF_COPY_SCORE_STATUS        : logi  NA NA NA NA NA NA ...
##  $ TOTAL_SCORE_PLUS_BONUS       : num  17 17 12 10 15 17 17 17 0 15 ...
##  $ TOTAL_SCORE_PLUS_BONUS_STATUS: chr  NA NA NA NA ...


Pull the regenerated DD

dfDD <- read_excel(revisedDDpath, sheet = "BCFCI_RC")


Handling Logical Variables

## extract all logical variables
logicols <- colnames(df)[sapply(df, is.logical)]

## view those variables in the regenerated DD
dfDD[dfDD$VarNames %in% logicols,c("VarNames","Data Type")]
## # A tibble: 4 × 2
##   VarNames              `Data Type`
##   <chr>                 <chr>      
## 1 REFCTR                VARCHAR2(6)
## 2 REVIEW_DATE           date       
## 3 REVIEWER              CHAR       
## 4 BCF_COPY_SCORE_STATUS CHAR
## Decide the target type of every all-NA (logical) column from the DD.
## `convert2num` is recomputed here for THIS sub-dataset: previously it was
## not defined in this section, so setdiff() below silently reused whatever
## value an earlier sub-dataset had left in the global environment.

## select the vars to be converted to numeric
convert2num <- dfDD$VarNames[dfDD$VarNames %in% logicols & grepl("number", dfDD$`Data Type`, ignore.case = TRUE)]

## select the vars to be converted to date
convert2date <- dfDD$VarNames[dfDD$VarNames %in% logicols & grepl("date", dfDD$`Data Type`, ignore.case = TRUE)] ## REVIEW_DATE

## the rest should be converted to character
convert2chr <- setdiff(logicols, c(convert2num, convert2date)) ## [1] "REFCTR"  "REVIEWER"  "BCF_COPY_SCORE_STATUS"

## convert (each assignment is a no-op when its selection is empty)
df[convert2num] <- lapply(df[convert2num], as.numeric)
df[convert2date] <- lapply(df[convert2date], as.Date)
df[convert2chr] <- lapply(df[convert2chr], as.character)

## recheck the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
## 
## [[2]]
## [1] "character"
## 
## [[3]]
## [1] "POSIXct" "POSIXt" 
## 
## [[4]]
## [1] "Date"


Handling Date Variables

## date-time columns still stored as POSIXct/POSIXt in the sub-dataset
datecols <- names(df)[vapply(df, inherits, logical(1), what = c("POSIXct", "POSIXt"))]
## [1] "EXAM_DATE"     "DATE_OF_BIRTH"

## variables declared as dates in the regenerated DD
datecolsFromDD <- dfDD[["VarNames"]][dfDD[["Data Type"]] %in% c("DATE", "date")]

## date columns found in the data but not declared as dates in the DD
setdiff(datecols, datecolsFromDD) ## character(0)
## character(0)
setdiff(datecolsFromDD,datecols) ## REVIEW_DATE, ignore it, as it has been converted in previous step
## [1] "REVIEW_DATE"
head(df[,datecols])
##    EXAM_DATE DATE_OF_BIRTH
## 1 2023-05-15    1949-12-01
## 2 2023-02-24    1949-06-10
## 3 2023-02-24    1941-10-04
## 4 2023-05-08    1956-04-15
## 5 2023-08-16    1956-01-09
## 6 2023-09-11    1949-05-19
## convert format
df[datecols] <- lapply(df[datecols], as.Date)

## recheck the unique data types
unique(sapply(df, class))
## [1] "numeric"   "character" "Date"


Handling Character Variables

## character columns present in the sub-dataset
chrcols <- names(df)[vapply(df, is.character, logical(1))] ## 13 vars

## DD rows whose declared type starts with VARCHAR / CHAR
chrColsfromDD <- dfDD[
  grepl("^(varchar|char)", dfDD[["Data Type"]], ignore.case = TRUE),
  c("VarNames", "Data Type")
]

## type-consistency check between data and DD:
## mismatchChrs_1: character in the data, another type in the DD
## mismatchChrs_2: character in the DD, another type in the data
mismatchChrs_1 <- setdiff(chrcols, chrColsfromDD[["VarNames"]]) ## character(0)
mismatchChrs_2 <- setdiff(chrColsfromDD[["VarNames"]], chrcols) ## character(0)

## character variables for which the DD lists valid responses
tmp <- dfDD[
  grepl("CHAR|VARCHAR", dfDD[["Data Type"]], ignore.case = TRUE) &
    !is.na(dfDD[["Valid Responses"]]),
  c("VarNames", "Valid Responses")
]

## compare the observed unique values of those columns with the DD
DT::datatable(check_valid_responses(tmp, df))
## ignore EXAMINER


Handling Numeric Variables

## numeric columns present in the sub-dataset
numcols <- names(df)[vapply(df, is.numeric, logical(1))] ## 22 vars

## DD rows whose declared type contains NUMBER
numColsfromDD <- dfDD[
  grepl("number", dfDD[["Data Type"]], ignore.case = TRUE),
  c("VarNames", "Valid Responses")
]

## type-consistency check between data and DD:
## mismatchNums_1: numeric in the data, another type in the DD
## mismatchNums_2: numeric in the DD, another type in the data
mismatchNums_1 <- setdiff(numcols, numColsfromDD[["VarNames"]]) ## character(0)
mismatchNums_2 <- setdiff(numColsfromDD[["VarNames"]], numcols) ## character(0)

## distinct valid-response specifications used by numeric variables
unique(numColsfromDD[["Valid Responses"]])
## [1] NA               "1 thru 99999;"  "1 thru 9999;"   "0;\r\n1;\r\n2;"
## [5] "0;\r\n1;"
tmp <- dfDD[grepl("number", dfDD$`Data Type`, ignore.case = TRUE) & !is.na(dfDD$`Valid Responses`),c("VarNames","Valid Responses")]

DT::datatable(check_valid_numeric_responses(tmp,df))
## ignore GP


Save Cleaned Data

BCFCI_RC <- df



BILINGUAL_SCALE_RC

## work on `df`; the cleaned result is assigned back to BILINGUAL_SCALE_RC at
## the end of this section
df <- BILINGUAL_SCALE_RC

## `info()` comes from the sourced helper script; judging by its printed
## output it reports observation, column and unique-SYSIND counts
## NOTE(review): helper not visible here -- confirm against helperScripts_PRADI.R
info(BILINGUAL_SCALE_RC,"SYSIND")
## #obs:240, cols:90, inds:240
## extract all the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
## 
## [[2]]
## [1] "character"
## 
## [[3]]
## [1] "logical"
## 
## [[4]]
## [1] "POSIXct" "POSIXt"
str(df, max.level = 99, list.len = 99999)
Click for details
## 'data.frame':    240 obs. of  90 variables:
##  $ SYSXM               : num  8275903 8275993 8258743 8259043 8277793 ...
##  $ SYSIND              : num  11160523 11620433 11034403 11369813 11435853 ...
##  $ SYSGP               : num  7923793 8005513 7888823 7952013 7962813 ...
##  $ SYSGPSTUDY          : num  1361903 1452223 1304163 1397123 1407923 ...
##  $ SYSINDGP            : num  7923633 8389503 7790023 8139083 8205123 ...
##  $ CGI_ORDER           : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ GPS_ORDER           : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ STDCGI_ORDER        : num  11 11 11 11 11 11 11 11 11 11 ...
##  $ LSTUDY              : chr  "ADCRLPRADI" "ADCONTROL" "ADFAMPRADI" "ADCRLPRADI" ...
##  $ DB_OWNER            : chr  "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" ...
##  $ STUDY               : chr  "ALZ" "ALZ" "ALZ" "ALZ" ...
##  $ SUBSTUDY            : chr  "ADCRLPRADI" "ADCONTROL" "ADFAMPRADI" "ADCRLPRADI" ...
##  $ CENTER              : chr  "IHG" "IHG" "IHG" "IHG" ...
##  $ GP                  : num  87883 104507 87556 88301 88452 ...
##  $ IND                 : num  1 1 9001 1 1 ...
##  $ REFCTR              : logi  NA NA NA NA NA NA ...
##  $ EXAM_DATE           : POSIXct, format: "2024-02-14" "2023-08-09" ...
##  $ EXAMINER            : chr  "gsv32" "jjs2031" "jjs2031" "jjs2031" ...
##  $ DATE_OF_BIRTH       : POSIXct, format: "1939-03-20" "1944-06-21" ...
##  $ AGE_AT_EXAM         : num  84 79 68 76 81 73 86 66 81 79 ...
##  $ REVIEW_DATE         : logi  NA NA NA NA NA NA ...
##  $ REVIEWER            : logi  NA NA NA NA NA NA ...
##  $ BILING_YEAR_EDU     : num  6 12 14 14 9 20 12 12 14 7 ...
##  $ BILING_LANG         : chr  "Spanish" "SPANISH" "SPANISH" "SPANISH" ...
##  $ BILING_OTHER_LANG   : num  0 1 1 0 1 0 0 1 0 0 ...
##  $ BILINGUAL_LANG_YES1 : chr  NA "ENGLISH" "SPANISH" NA ...
##  $ BILINGUAL_LANG_YES2 : chr  NA "SPANISH" "ENGLISH" NA ...
##  $ BILINGUAL_LANG_YES3 : chr  NA NA NA NA ...
##  $ BILINGUAL_LANG_YES4 : logi  NA NA NA NA NA NA ...
##  $ BILINGUAL_REGION1   : chr  NA NA NA NA ...
##  $ BILINGUAL_REGION2   : chr  NA NA NA NA ...
##  $ BILINGUAL_REGION3   : chr  NA NA NA NA ...
##  $ BILINGUAL_REGION4   : logi  NA NA NA NA NA NA ...
##  $ BILINGUAL_LENGTH1   : chr  NA NA NA NA ...
##  $ BILINGUAL_LENGTH2   : chr  NA NA NA NA ...
##  $ BILINGUAL_LENGTH3   : chr  NA NA NA NA ...
##  $ BILINGUAL_LENGTH4   : logi  NA NA NA NA NA NA ...
##  $ BILINGUAL_LANG1     : chr  NA "ENGLISH" "SPANISH" NA ...
##  $ BILINGUAL_LANG2     : chr  NA "SPANISH" "ENGLISH" NA ...
##  $ BILINGUAL_LANG3     : chr  NA NA NA NA ...
##  $ BILINGUAL_LANG4     : logi  NA NA NA NA NA NA ...
##  $ BILINGUAL_FREQUENCY1: num  NA 7 7 NA 7 NA NA NA NA NA ...
##  $ BILINGUAL_FREQUENCY2: num  NA 7 4 NA 7 NA NA NA NA NA ...
##  $ BILINGUAL_FREQUENCY3: num  NA NA NA NA NA NA NA NA NA NA ...
##  $ BILINGUAL_FREQUENCY4: logi  NA NA NA NA NA NA ...
##  $ BILINGUAL_LEARN1    : chr  NA "ENGLISH" "SPANISH" NA ...
##  $ BILINGUAL_LEARN2    : chr  NA "SPANISH" "ENGLISH" NA ...
##  $ BILINGUAL_LEARN3    : chr  NA NA NA NA ...
##  $ BILINGUAL_LEARN4    : logi  NA NA NA NA NA NA ...
##  $ BILINGUAL_HOME1     : num  NA NA 1 NA NA NA NA 1 NA NA ...
##  $ BILINGUAL_HOME2     : num  NA 1 NA NA NA NA NA NA NA NA ...
##  $ BILINGUAL_HOME3     : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ BILINGUAL_HOME4     : logi  NA NA NA NA NA NA ...
##  $ BILINGUAL_SCHOOL1   : num  NA NA 1 NA 8 NA NA NA NA NA ...
##  $ BILINGUAL_SCHOOL2   : num  NA 1 NA NA NA NA NA 1 NA NA ...
##  $ BILINGUAL_SCHOOL3   : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ BILINGUAL_SCHOOL4   : logi  NA NA NA NA NA NA ...
##  $ BILINGUAL_MIGRAT1   : num  NA 1 NA NA NA NA NA NA NA NA ...
##  $ BILINGUAL_MIGRAT2   : num  NA NA 1 NA 27 NA NA 1 NA NA ...
##  $ BILINGUAL_MIGRAT3   : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ BILINGUAL_MIGRAT4   : logi  NA NA NA NA NA NA ...
##  $ BILINGUAL_NONFORMAL1: num  NA NA NA NA NA NA NA NA NA NA ...
##  $ BILINGUAL_NONFORMAL2: num  NA NA NA NA NA NA NA 1 NA NA ...
##  $ BILINGUAL_NONFORMAL3: logi  NA NA NA NA NA NA ...
##  $ BILINGUAL_NONFORMAL4: logi  NA NA NA NA NA NA ...
##  $ BILINGUAL_OTHER1    : logi  NA NA NA NA NA NA ...
##  $ BILINGUAL_OTHER2    : logi  NA NA NA NA NA NA ...
##  $ BILINGUAL_OTHER3    : logi  NA NA NA NA NA NA ...
##  $ BILINGUAL_OTHER4    : logi  NA NA NA NA NA NA ...
##  $ BILINGUAL_RATE1     : chr  NA "ENGLISH" "SPANISH" NA ...
##  $ BILINGUAL_RATE2     : chr  NA "SPANISH" "ENGLISH" NA ...
##  $ BILINGUAL_RATE3     : chr  NA NA NA NA ...
##  $ BILINGUAL_RATE4     : logi  NA NA NA NA NA NA ...
##  $ BILINGUAL_READ1     : num  NA 7 7 NA 7 NA NA 7 NA NA ...
##  $ BILINGUAL_READ2     : num  NA 7 4 NA 7 NA NA 7 NA NA ...
##  $ BILINGUAL_READ3     : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ BILINGUAL_READ4     : logi  NA NA NA NA NA NA ...
##  $ BILINGUAL_WRITE1    : num  NA 7 7 NA 7 NA NA 7 NA NA ...
##  $ BILINGUAL_WRITE2    : num  NA 7 4 NA 7 NA NA 7 NA NA ...
##  $ BILINGUAL_WRITE3    : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ BILINGUAL_SPEAK1    : num  NA 7 7 NA 7 NA NA 7 NA NA ...
##  $ BILINGUAL_WRITE4    : logi  NA NA NA NA NA NA ...
##  $ BILINGUAL_SPEAK2    : num  NA 7 4 NA 7 NA NA 7 NA NA ...
##  $ BILINGUAL_SPEAK3    : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ BILINGUAL_SPEAK4    : logi  NA NA NA NA NA NA ...
##  $ BILINGUAL_LISTEN1   : num  NA 7 7 NA 7 NA NA 7 NA NA ...
##  $ BILINGUAL_LISTEN2   : num  NA 7 4 NA 7 NA NA 7 NA NA ...
##  $ BILINGUAL_LISTEN3   : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ BILINGUAL_LISTEN4   : logi  NA NA NA NA NA NA ...
##  $ BILINGUAL_TIME      : num  NA NA NA NA 20 NA NA NA NA NA ...


Pull the regenerated DD

dfDD <- read_excel(revisedDDpath, sheet = "BILINGUAL_SCALE_RC")


Handling Logical Variables

## extract all logical variables
logicols <- colnames(df)[sapply(df, is.logical)] ## 23 vars

## view those variables in the regenerated DD
dfDD[dfDD$VarNames %in% logicols,c("VarNames","Data Type")]
## # A tibble: 23 × 2
##    VarNames             `Data Type` 
##    <chr>                <chr>       
##  1 REFCTR               VARCHAR2(6) 
##  2 REVIEW_DATE          date        
##  3 REVIEWER             CHAR        
##  4 BILINGUAL_LANG_YES4  VARCHAR2(25)
##  5 BILINGUAL_REGION4    VARCHAR2(10)
##  6 BILINGUAL_LENGTH4    VARCHAR2(25)
##  7 BILINGUAL_LANG4      VARCHAR2(25)
##  8 BILINGUAL_FREQUENCY4 NUMBER(2)   
##  9 BILINGUAL_LEARN4     VARCHAR2(25)
## 10 BILINGUAL_HOME4      NUMBER(2)   
## # ℹ 13 more rows
## Decide the target type of every all-NA (logical) column from the DD.

## select the vars to be converted to numeric
## (case-insensitive, matching the numeric-variable check later in this
## section; a lower-case "number" type would otherwise fall through to the
## character conversion below)
convert2num <- dfDD$VarNames[dfDD$VarNames %in% logicols & grepl("number", dfDD$`Data Type`, ignore.case = TRUE)]

## select the vars to be converted to date
convert2date <- dfDD$VarNames[dfDD$VarNames %in% logicols & grepl("date", dfDD$`Data Type`, ignore.case = TRUE)] ## REVIEW_DATE

## the rest should be converted to character
convert2chr <- setdiff(logicols, c(convert2num, convert2date))

## convert (each assignment is a no-op when its selection is empty)
df[convert2num] <- lapply(df[convert2num], as.numeric)
df[convert2date] <- lapply(df[convert2date], as.Date)
df[convert2chr] <- lapply(df[convert2chr], as.character)

## recheck the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
## 
## [[2]]
## [1] "character"
## 
## [[3]]
## [1] "POSIXct" "POSIXt" 
## 
## [[4]]
## [1] "Date"


Handling Date Variables

## date-time columns still stored as POSIXct/POSIXt in the sub-dataset
datecols <- names(df)[vapply(df, inherits, logical(1), what = c("POSIXct", "POSIXt"))]
## [1] "EXAM_DATE"     "DATE_OF_BIRTH"

## variables declared as dates in the regenerated DD
datecolsFromDD <- dfDD[["VarNames"]][dfDD[["Data Type"]] %in% c("DATE", "date")]

## date columns found in the data but not declared as dates in the DD
setdiff(datecols, datecolsFromDD) ## character(0)
## character(0)
setdiff(datecolsFromDD,datecols) ## [1] "REVIEW_DATE" can ignore REVIEW_DATE, as it has been corrected in previous step
## [1] "REVIEW_DATE"
head(df[,datecols])
##    EXAM_DATE DATE_OF_BIRTH
## 1 2024-02-14    1939-03-20
## 2 2023-08-09    1944-06-21
## 3 2023-06-22    1954-08-20
## 4 2024-02-13    1947-05-13
## 5 2024-02-15    1942-09-30
## 6 2023-05-15    1950-04-02
## convert format
df[datecols] <- lapply(df[datecols], as.Date)

## recheck the unique data types
unique(sapply(df, class))
## [1] "numeric"   "character" "Date"


Handling Character Variables

## character columns present in the sub-dataset
chrcols <- names(df)[vapply(df, is.character, logical(1))] ## 33 vars

## DD rows whose declared type starts with VARCHAR / CHAR
chrColsfromDD <- dfDD[
  grepl("^(varchar|char)", dfDD[["Data Type"]], ignore.case = TRUE),
  c("VarNames", "Data Type")
]

## type-consistency check between data and DD:
## mismatchChrs_1: character in the data, another type in the DD
## mismatchChrs_2: character in the DD, another type in the data
mismatchChrs_1 <- setdiff(chrcols, chrColsfromDD[["VarNames"]]) ## character(0)
mismatchChrs_2 <- setdiff(chrColsfromDD[["VarNames"]], chrcols) ## character(0)

## character variables for which the DD lists valid responses
tmp <- dfDD[
  grepl("CHAR|VARCHAR", dfDD[["Data Type"]], ignore.case = TRUE) &
    !is.na(dfDD[["Valid Responses"]]),
  c("VarNames", "Valid Responses")
]

## compare the observed unique values of those columns with the DD
DT::datatable(check_valid_responses(tmp, df))
## ignore EXAMINER


Handling Numeric Variables

## numeric columns present in the sub-dataset
numcols <- names(df)[vapply(df, is.numeric, logical(1))]

## DD rows whose declared type contains NUMBER
numColsfromDD <- dfDD[
  grepl("number", dfDD[["Data Type"]], ignore.case = TRUE),
  c("VarNames", "Valid Responses")
]

## type-consistency check between data and DD:
## mismatchNums_1: numeric in the data, another type in the DD
## mismatchNums_2: numeric in the DD, another type in the data
mismatchNums_1 <- setdiff(numcols, numColsfromDD[["VarNames"]]) ## character(0)
mismatchNums_2 <- setdiff(numColsfromDD[["VarNames"]], numcols) ## character(0)

## distinct valid-response specifications used by numeric variables
unique(numColsfromDD[["Valid Responses"]])
## [1] NA                                      
## [2] "1 thru 99999;"                         
## [3] "1 thru 9999;"                          
## [4] "0;\r\n1;"                              
## [5] "1;\r\n2;\r\n3;\r\n4;\r\n5;\r\n6;\r\n7;"
tmp <- dfDD[grepl("number", dfDD$`Data Type`, ignore.case = TRUE) & !is.na(dfDD$`Valid Responses`),c("VarNames","Valid Responses")]

DT::datatable(check_valid_numeric_responses(tmp,df))
## ignore GP


Save Cleaned Data

BILINGUAL_SCALE_RC <- df



CAT_FLUENCY_RC

## work on `df` for the cleaning steps of this section
df <- CAT_FLUENCY_RC

## `info()` comes from the sourced helper script; judging by its printed
## output it reports observation, column and unique-SYSIND counts
## NOTE(review): the printed summary shows 555 obs but 550 inds, i.e. some
## SYSIND values repeat -- confirm repeated exams are expected here
info(CAT_FLUENCY_RC,"SYSIND")
## #obs:555, cols:29, inds:550
## extract all the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
## 
## [[2]]
## [1] "character"
## 
## [[3]]
## [1] "logical"
## 
## [[4]]
## [1] "POSIXct" "POSIXt"
str(df, max.level = 99, list.len = 99999)
Click for details
## 'data.frame':    555 obs. of  29 variables:
##  $ SYSXM        : num  8276513 8258853 8258903 8260133 8277653 ...
##  $ SYSIND       : num  11369703 11369813 11037673 11620563 11435853 ...
##  $ SYSGP        : num  7951913 7952013 7894423 8005633 7962813 ...
##  $ SYSGPSTUDY   : num  1397023 1397123 1309743 1452343 1407923 ...
##  $ SYSINDGP     : num  8138973 8139083 7793413 8389633 8205123 ...
##  $ CGI_ORDER    : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ GPS_ORDER    : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ STDCGI_ORDER : num  11 11 11 11 11 11 11 11 11 11 ...
##  $ LSTUDY       : chr  "ADCRLPRADI" "ADCRLPRADI" "ADFAMPRADI" "ADCONTROL" ...
##  $ DB_OWNER     : chr  "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" ...
##  $ STUDY        : chr  "ALZ" "ALZ" "ALZ" "ALZ" ...
##  $ SUBSTUDY     : chr  "ADCRLPRADI" "ADCRLPRADI" "ADFAMPRADI" "ADCONTROL" ...
##  $ CENTER       : chr  "IHG" "IHG" "IHG" "IHG" ...
##  $ GP           : num  88299 88301 87650 104477 88452 ...
##  $ IND          : num  1 1 9000 1 1 ...
##  $ REFCTR       : logi  NA NA NA NA NA NA ...
##  $ EXAM_DATE    : POSIXct, format: "2024-02-13" "2024-02-13" ...
##  $ EXAMINER     : chr  "gsv32" "jjs2031" "gsv32" "jjs2031" ...
##  $ DATE_OF_BIRTH: POSIXct, format: "1944-09-22" "1947-05-13" ...
##  $ AGE_AT_EXAM  : num  79 76 68 73 81 86 81 73 60 79 ...
##  $ REVIEW_DATE  : logi  NA NA NA NA NA NA ...
##  $ REVIEWER     : logi  NA NA NA NA NA NA ...
##  $ ANIM_ENTRY   : chr  "perro, gato, pajaritos, jirafa, cerditos, conejo, paloma, vaca, bueyes, hipopotamos, peces, aguila, avestruz, guinea" "ELEFANTE VACA CHIVE PERRO BUFALO CERDO TORO HORMISA" "perro, conejo, gato, gallina, elefante, caballo, paloma, gato, mono, leon, jirafa, lagartijo, raton, culebra" "DOG CAT BIRD LION CAMEL HORSE ZEBRA CHIT... MONKEY MULE DONKEY OSTRICH PARROT EAGLE Moj... RAT COCKROACH FISH SHARK SARDINE" ...
##  $ ANIM_SCORE   : num  14 8 13 20 19 12 20 17 25 11 ...
##  $ ANIM_STATUS  : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ VEG_ENTRY    : chr  "tomate, lechuga, ganganbo, repollo, peti poas, calabaza" "BAFATA NAME YAUTIA PAPA MALAGA ZANCHORIA APIO HABICHUELA TERNIA MAIZ" "platano, yautia, name, chayote, pepinillo, remolacha, esparrago, repollo, lechuga, tomate, papa, habichuelas" "MALANGA PUMPKIN PLANTAIN YUCA CORN PEAR (X)  PEACH (X) GRAPE (X) STRAWBERRY (X) Sapote (X) Mamey (X) WATERMELON"| __truncated__ ...
##  $ VEG_SCORE    : num  6 9 14 9 13 11 15 8 16 7 ...
##  $ VEG_STATUS   : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ NOTE_CATEGORY: chr  NA NA NA NA ...


Pull the regenerated DD

## read the "CAT_FLUENCY_RC" sheet of the revised data dictionary workbook (revisedDDpath);
## the columns used below are VarNames, `Data Type` and `Valid Responses`
dfDD <- read_excel(revisedDDpath, sheet = "CAT_FLUENCY_RC")


Handling Logical Variables

## extract all logical variables
## (in the str() output these columns show only NA values)
logicols <- colnames(df)[sapply(df, is.logical)]

## view those variables in the regenerated DD
dfDD[dfDD$VarNames %in% logicols,c("VarNames","Data Type")]
## # A tibble: 3 × 2
##   VarNames    `Data Type`
##   <chr>       <chr>      
## 1 REFCTR      VARCHAR2(6)
## 2 REVIEW_DATE date       
## 3 REVIEWER    CHAR
## select the vars to be converted to numeric
## NOTE: convert2num has to be re-derived for every sub-dataset; otherwise the
## setdiff() below silently reuses whatever an earlier section left behind
convert2num <- dfDD$VarNames[dfDD$VarNames %in% logicols & grepl("number",dfDD$`Data Type`,ignore.case = TRUE)] ## character(0)

## select the vars to be converted to date
convert2date <-  dfDD$VarNames[dfDD$VarNames %in% logicols & grepl("date",dfDD$`Data Type`,ignore.case = TRUE)] ## REVIEW_DATE

## the rest should be converted to character
convert2chr <- setdiff(logicols,c(convert2num,convert2date)) ## [1] "REFCTR"   "REVIEWER"

## convert (each assignment is a no-op when its selection is empty)
df[convert2num] <- lapply(df[convert2num], as.numeric)
df[convert2date] <- lapply(df[convert2date], as.Date)
df[convert2chr] <- lapply(df[convert2chr], as.character)

## recheck the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
## 
## [[2]]
## [1] "character"
## 
## [[3]]
## [1] "POSIXct" "POSIXt" 
## 
## [[4]]
## [1] "Date"


Handling Date Variables

## extract date variables from sub-dataset (still POSIXct at this point)
datecols <- colnames(df)[sapply(df, function(x) inherits(x, c("POSIXct", "POSIXt")))]
## [1] "EXAM_DATE"     "DATE_OF_BIRTH" 

## extract date variables from regenerated DD
## the DD uses mixed case for this type (e.g. "date" for REVIEW_DATE), so compare case-insensitively
datecolsFromDD <- dfDD$VarNames[toupper(dfDD$`Data Type`) %in% "DATE"]

## compare the two to see if we are missing any date variables
setdiff(datecols,datecolsFromDD) ## character(0)
## character(0)
setdiff(datecolsFromDD,datecols) ## [1] "REVIEW_DATE" can ignore REVIEW_DATE, as it has been corrected in previous step
## [1] "REVIEW_DATE"
## drop = FALSE keeps a data.frame even if only one date column is present
head(df[,datecols,drop = FALSE])
##    EXAM_DATE DATE_OF_BIRTH
## 1 2024-02-13    1944-09-22
## 2 2024-02-13    1947-05-13
## 3 2023-10-24    1954-10-29
## 4 2023-05-15    1949-12-01
## 5 2024-02-15    1942-09-30
## 6 2023-05-09    1936-05-22
## convert format (POSIXct -> Date)
df[datecols] <- lapply(df[datecols], as.Date)

## recheck the unique data types
unique(sapply(df, class))
## [1] "numeric"   "character" "Date"


Handling Character Variables

## character columns present in the sub-dataset
chrcols <- names(df)[vapply(df, is.character, logical(1))] ## 11 vars

## DD entries whose declared type starts with CHAR / VARCHAR
chrColsfromDD <- dfDD[
  grepl("^(varchar|char)", dfDD[["Data Type"]], ignore.case = TRUE),
  c("VarNames", "Data Type")
]

## data type consistency check:
## mismatchChrs_1: chr in the data but another type in the DD
## mismatchChrs_2: chr in the DD but another type in the data
mismatchChrs_1 <- setdiff(chrcols, chrColsfromDD[["VarNames"]]) ## character(0)
mismatchChrs_2 <- setdiff(chrColsfromDD[["VarNames"]], chrcols) ## character(0)

## character variables for which the DD lists valid responses
tmp <- dfDD[
  !is.na(dfDD[["Valid Responses"]]) &
    grepl("CHAR|VARCHAR", dfDD[["Data Type"]], ignore.case = TRUE),
  c("VarNames", "Valid Responses")
]

## compare the unique values of those chr columns in the dataset with the DD
DT::datatable(check_valid_responses(tmp, df))
## ignore EXAMINER


Handling Numeric Variables

## numeric columns present in the sub-dataset
numcols <- names(df)[vapply(df, is.numeric, logical(1))] ## 15 vars

## numeric (NUMBER*) variables according to the DD
numColsfromDD <- dfDD[
  grepl("number", dfDD[["Data Type"]], ignore.case = TRUE),
  c("VarNames", "Valid Responses")
]

## data type consistency check:
## mismatchNums_1: numeric in the data but another type in the DD
## mismatchNums_2: numeric in the DD but another type in the data
mismatchNums_1 <- setdiff(numcols, numColsfromDD[["VarNames"]]) ## character(0)
mismatchNums_2 <- setdiff(numColsfromDD[["VarNames"]], numcols) ## character(0)

unique(numColsfromDD[["Valid Responses"]])
## [1] NA                             "1 thru 99999;"               
## [3] "1 thru 9999;"                 "995;\r\n996;\r\n997;\r\n998;"
## keep only the numeric variables that come with a value specification
tmp <- numColsfromDD[!is.na(numColsfromDD[["Valid Responses"]]), ]

DT::datatable(check_valid_numeric_responses(tmp, df))
## ignore GP


Save Cleaned Data

## overwrite the CAT_FLUENCY_RC object with the type-corrected working copy
CAT_FLUENCY_RC <- df



CERAD_DEL_RC

## work on a copy named df; CERAD_DEL_RC itself is only overwritten at the end of the section
df <- CERAD_DEL_RC

## info() is a project helper defined outside this section; judging by the printed
## line it reports #rows, #columns and #distinct SYSIND values -- TODO confirm
info(CERAD_DEL_RC,"SYSIND")
## #obs:177, cols:44, inds:177
## extract all the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
## 
## [[2]]
## [1] "character"
## 
## [[3]]
## [1] "logical"
## 
## [[4]]
## [1] "POSIXct" "POSIXt"
## print the structure of every column (list.len raised so the column listing is not cut off)
str(df, max.level = 99, list.len = 99999)
Click for details
## 'data.frame':    177 obs. of  44 variables:
##  $ SYSXM                    : num  8275853 8260563 8278733 8264043 8264683 ...
##  $ SYSIND                   : num  11160523 11163453 11618053 11620393 11617573 ...
##  $ SYSGP                    : num  7923793 7924953 8005213 8005493 8004733 ...
##  $ SYSGPSTUDY               : num  1361903 1363063 1451923 1452203 1451443 ...
##  $ SYSINDGP                 : num  7923633 7926663 8387123 8389463 8386643 ...
##  $ CGI_ORDER                : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ GPS_ORDER                : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ STDCGI_ORDER             : num  11 11 11 11 11 11 11 11 11 11 ...
##  $ LSTUDY                   : chr  "ADCRLPRADI" "ADFAMPRADI" "ADCONTROL" "ADCONTROL" ...
##  $ DB_OWNER                 : chr  "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" ...
##  $ STUDY                    : chr  "ALZ" "ALZ" "ALZ" "ALZ" ...
##  $ SUBSTUDY                 : chr  "ADCRLPRADI" "ADFAMPRADI" "ADCONTROL" "ADCONTROL" ...
##  $ CENTER                   : chr  "IHG" "IHG" "IHG" "IHG" ...
##  $ GP                       : num  87883 87923 104511 104500 104525 ...
##  $ IND                      : num  1 9000 1 1 1 105 110 1 1 1 ...
##  $ REFCTR                   : logi  NA NA NA NA NA NA ...
##  $ EXAM_DATE                : POSIXct, format: "2024-02-14" "2023-10-25" ...
##  $ EXAMINER                 : chr  "gsv32" "gsv32" "jjs2031" "jjs2031" ...
##  $ DATE_OF_BIRTH            : POSIXct, format: "1939-03-20" "1967-06-15" ...
##  $ AGE_AT_EXAM              : num  84 56 77 70 91 63 65 64 76 81 ...
##  $ REVIEW_DATE              : logi  NA NA NA NA NA NA ...
##  $ REVIEWER                 : logi  NA NA NA NA NA NA ...
##  $ WLM_CRTA                 : num  1 0 1 1 1 0 0 1 1 1 ...
##  $ WLM_CRTB                 : num  0 0 0 0 0 1 1 1 1 0 ...
##  $ WLM_CRTC                 : num  0 1 0 0 0 0 0 1 1 1 ...
##  $ WLM_CRTD                 : num  0 0 0 1 0 0 0 1 0 0 ...
##  $ WLM_CRTE                 : num  0 0 1 1 1 1 1 1 0 1 ...
##  $ WLM_CRTF                 : num  0 0 0 0 0 1 0 1 0 1 ...
##  $ WLM_CRTG                 : num  0 1 0 1 0 1 0 0 1 1 ...
##  $ WLM_CRTH                 : num  0 0 1 0 0 1 0 1 1 0 ...
##  $ WLM_CRTI                 : num  0 0 0 1 0 0 0 1 0 1 ...
##  $ WLM_CRTJ                 : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ WLM_INT1                 : num  1 1 NA NA NA NA NA NA NA NA ...
##  $ WLM_INT2                 : num  NA 1 NA NA NA NA NA NA NA NA ...
##  $ WLM_INT3                 : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ WLM_INT4                 : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ WLM_INT5                 : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ NOTES_CERADRECALL        : chr  NA NA NA NA ...
##  $ WLM_CRT                  : num  1 2 3 5 2 5 2 8 5 6 ...
##  $ WLM_CRT_STATUS           : chr  NA NA NA NA ...
##  $ WLM_INT                  : num  1 2 NA NA NA NA NA NA NA NA ...
##  $ WLM_INT_STATUS           : chr  "partial" "partial" NA NA ...
##  $ SCALES_CERADRECALL       : chr  "3" "4" "5" "7" ...
##  $ SCALES_CERADRECALL_STATUS: chr  NA NA NA NA ...


Pull the regenerated DD

## read the "CERAD_DEL_RC" sheet of the revised data dictionary workbook (revisedDDpath);
## the columns used below are VarNames, `Data Type` and `Valid Responses`
dfDD <- read_excel(revisedDDpath, sheet = "CERAD_DEL_RC")


Handling Logical Variables

## extract all logical variables
## (in the str() output these columns show only NA values)
logicols <- colnames(df)[sapply(df, is.logical)]

## view those variables in the regenerated DD
dfDD[dfDD$VarNames %in% logicols,c("VarNames","Data Type")]
## # A tibble: 3 × 2
##   VarNames    `Data Type`
##   <chr>       <chr>      
## 1 REFCTR      VARCHAR2(6)
## 2 REVIEW_DATE date       
## 3 REVIEWER    CHAR
## select the vars to be converted to numeric
## NOTE: convert2num has to be re-derived for every sub-dataset; otherwise the
## setdiff() below silently reuses whatever an earlier section left behind
convert2num <- dfDD$VarNames[dfDD$VarNames %in% logicols & grepl("number",dfDD$`Data Type`,ignore.case = TRUE)] ## character(0)

## select the vars to be converted to date
convert2date <-  dfDD$VarNames[dfDD$VarNames %in% logicols & grepl("date",dfDD$`Data Type`,ignore.case = TRUE)] ## REVIEW_DATE

## the rest should be converted to character
convert2chr <- setdiff(logicols,c(convert2num,convert2date)) ## [1] "REFCTR"   "REVIEWER"

## convert (each assignment is a no-op when its selection is empty)
df[convert2num] <- lapply(df[convert2num], as.numeric)
df[convert2date] <- lapply(df[convert2date], as.Date)
df[convert2chr] <- lapply(df[convert2chr], as.character)

## recheck the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
## 
## [[2]]
## [1] "character"
## 
## [[3]]
## [1] "POSIXct" "POSIXt" 
## 
## [[4]]
## [1] "Date"


Handling Date Variables

## extract date variables from sub-dataset (still POSIXct at this point)
datecols <- colnames(df)[sapply(df, function(x) inherits(x, c("POSIXct", "POSIXt")))]
## [1] "EXAM_DATE"     "DATE_OF_BIRTH" 

## extract date variables from regenerated DD
## the DD uses mixed case for this type (e.g. "date" for REVIEW_DATE), so compare case-insensitively
datecolsFromDD <- dfDD$VarNames[toupper(dfDD$`Data Type`) %in% "DATE"]

## compare the two to see if we are missing any date variables
setdiff(datecols,datecolsFromDD) ## character(0)
## character(0)
setdiff(datecolsFromDD,datecols) ## [1] "REVIEW_DATE" can ignore REVIEW_DATE, as it has been corrected in previous step
## [1] "REVIEW_DATE"
## drop = FALSE keeps a data.frame even if only one date column is present
head(df[,datecols,drop = FALSE])
##    EXAM_DATE DATE_OF_BIRTH
## 1 2024-02-14    1939-03-20
## 2 2023-10-25    1967-06-15
## 3 2023-08-11    1946-06-19
## 4 2023-08-14    1952-08-29
## 5 2023-08-18    1931-09-20
## 6 2023-06-19    1960-06-04
## convert format (POSIXct -> Date)
df[datecols] <- lapply(df[datecols], as.Date)

## recheck the unique data types
unique(sapply(df, class))
## [1] "numeric"   "character" "Date"


Handling Character Variables

## extract character variables from sub-dataset
chrcols <- colnames(df)[sapply(df, is.character)] ## 13 vars

## check data type inconsistency:
## mismatchChrs_1: present as chr in data but others in the DD
## mismatchChrs_2: present as chr in DD but others in the data
chrColsfromDD <- dfDD[grepl("^(varchar|char)", dfDD$`Data Type`, ignore.case = TRUE),c("VarNames","Data Type")]

mismatchChrs_1 <- setdiff(chrcols,chrColsfromDD$VarNames) ## [1] SCALES_CERADRECALL
## SCALES_CERADRECALL shows numeric in DD, but read in as character
## the reason it pops up is because they use "na" to represent the NAs
## I will correct it and convert it to numeric

## %in% never yields NA, so rows that are already missing stay out of the mask;
## tolower()/trimws() also catch spellings such as "NA" or " na"
df$SCALES_CERADRECALL[tolower(trimws(df$SCALES_CERADRECALL)) %in% "na"] <- NA
unique(df$SCALES_CERADRECALL)
## [1] "3"  "4"  "5"  "7"  "11" "8"  NA   "10" "13"
df$SCALES_CERADRECALL <- as.numeric(df$SCALES_CERADRECALL)

## SCALES_CERADRECALL is numeric from here on, so drop it from the chr list
chrcols <- setdiff(chrcols,"SCALES_CERADRECALL")

mismatchChrs_2 <- setdiff(chrColsfromDD$VarNames,chrcols) ## character(0)

## extract character variables with value specification
tmp <- dfDD[grepl("CHAR|VARCHAR", dfDD$`Data Type`, ignore.case = TRUE) & !is.na(dfDD$`Valid Responses`),c("VarNames","Valid Responses")]

## check if the unique values for the chr columns in the dataset match the DD
DT::datatable(check_valid_responses(tmp,df))
## ignore EXAMINER


Handling Numeric Variables

## numeric columns present in the sub-dataset
numcols <- names(df)[vapply(df, is.numeric, logical(1))] ## 29 vars

## numeric (NUMBER*) variables according to the DD
numColsfromDD <- dfDD[
  grepl("number", dfDD[["Data Type"]], ignore.case = TRUE),
  c("VarNames", "Valid Responses")
]

## data type consistency check:
## mismatchNums_1: numeric in the data but another type in the DD
## mismatchNums_2: numeric in the DD but another type in the data
mismatchNums_1 <- setdiff(numcols, numColsfromDD[["VarNames"]]) ## character(0)
mismatchNums_2 <- setdiff(numColsfromDD[["VarNames"]], numcols) ## character(0)

unique(numColsfromDD[["Valid Responses"]])
## [1] NA              "1 thru 99999;" "1 thru 9999;"  "1;\r\n0;"     
## [5] "1;"
## keep only the numeric variables that come with a value specification
tmp <- numColsfromDD[!is.na(numColsfromDD[["Valid Responses"]]), ]

DT::datatable(check_valid_numeric_responses(tmp, df))
## ignore GP


Save Cleaned Data

## overwrite the CERAD_DEL_RC object with the type-corrected working copy
CERAD_DEL_RC <- df



CERAD_IMM_RC

## work on a copy named df; CERAD_IMM_RC itself is only overwritten at the end of the section
df <- CERAD_IMM_RC

## info() is a project helper defined outside this section; judging by the printed
## line it reports #rows, #columns and #distinct SYSIND values -- TODO confirm
info(CERAD_IMM_RC,"SYSIND")
## #obs:188, cols:88, inds:188
## extract all the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
## 
## [[2]]
## [1] "character"
## 
## [[3]]
## [1] "logical"
## 
## [[4]]
## [1] "POSIXct" "POSIXt"
## print the structure of every column (list.len raised so the column listing is not cut off)
str(df, max.level = 99, list.len = 99999)
Click for details
## 'data.frame':    188 obs. of  88 variables:
##  $ SYSXM                    : num  8260413 8278683 8264003 8264323 8264633 ...
##  $ SYSIND                   : num  11163453 11618053 11620393 11618173 11617573 ...
##  $ SYSGP                    : num  7924953 8005213 8005493 8005333 8004733 ...
##  $ SYSGPSTUDY               : num  1363063 1451923 1452203 1452043 1451443 ...
##  $ SYSINDGP                 : num  7926663 8387123 8389463 8387243 8386643 ...
##  $ CGI_ORDER                : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ GPS_ORDER                : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ STDCGI_ORDER             : num  11 11 11 11 11 11 11 11 11 11 ...
##  $ LSTUDY                   : chr  "ADFAMPRADI" "ADCONTROL" "ADCONTROL" "ADCONTROL" ...
##  $ DB_OWNER                 : chr  "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" ...
##  $ STUDY                    : chr  "ALZ" "ALZ" "ALZ" "ALZ" ...
##  $ SUBSTUDY                 : chr  "ADFAMPRADI" "ADCONTROL" "ADCONTROL" "ADCONTROL" ...
##  $ CENTER                   : chr  "IHG" "IHG" "IHG" "IHG" ...
##  $ GP                       : num  87923 104511 104500 104499 104525 ...
##  $ IND                      : num  9000 1 1 1 1 105 110 1 1 1 ...
##  $ REFCTR                   : logi  NA NA NA NA NA NA ...
##  $ EXAM_DATE                : POSIXct, format: "2023-10-25" "2023-08-11" ...
##  $ EXAMINER                 : chr  "gsv32" "jjs2031" "jjs2031" "jjs2031" ...
##  $ DATE_OF_BIRTH            : POSIXct, format: "1967-06-15" "1946-06-19" ...
##  $ AGE_AT_EXAM              : num  56 77 70 81 91 63 65 69 76 83 ...
##  $ REVIEW_DATE              : logi  NA NA NA NA NA NA ...
##  $ REVIEWER                 : logi  NA NA NA NA NA NA ...
##  $ CERAD_PRESENTATION       : num  1 2 2 1 2 1 1 2 1 2 ...
##  $ WLM_1A                   : num  0 1 1 1 1 NA NA 1 1 1 ...
##  $ WLM_1B                   : num  0 0 0 NA NA NA NA 1 NA 0 ...
##  $ WLM_1C                   : num  0 1 0 1 NA NA 1 1 1 1 ...
##  $ WLM_1D                   : num  0 0 0 NA NA NA NA NA NA 0 ...
##  $ WLM_1E                   : num  0 0 0 NA NA 1 NA NA NA 0 ...
##  $ WLM_1F                   : num  0 0 1 NA NA NA NA 1 NA 0 ...
##  $ WLM_1G                   : num  0 0 0 NA NA NA NA 1 NA 0 ...
##  $ WLM_1H                   : num  0 0 0 NA NA NA 1 NA NA 0 ...
##  $ WLM_1I                   : num  0 0 0 1 NA 1 NA NA NA 0 ...
##  $ WLM_1J                   : num  0 0 1 1 1 1 1 1 1 1 ...
##  $ WLM_1INT1                : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ WLM_1INT2                : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ WLM_1INT3                : logi  NA NA NA NA NA NA ...
##  $ WLM_1INT4                : logi  NA NA NA NA NA NA ...
##  $ WLM_1INT5                : logi  NA NA NA NA NA NA ...
##  $ WLM_1INT6                : logi  NA NA NA NA NA NA ...
##  $ WLM_2H                   : num  1 1 1 0 0 1 1 0 1 0 ...
##  $ WLM_2F                   : num  0 0 1 1 0 1 1 1 0 0 ...
##  $ WLM_2A                   : num  0 1 1 1 1 0 1 1 1 1 ...
##  $ WLM_2C                   : num  1 0 0 1 1 1 0 0 1 1 ...
##  $ WLM_2J                   : num  1 0 0 1 1 1 0 0 1 0 ...
##  $ WLM_2B                   : num  1 0 0 1 0 1 0 1 1 1 ...
##  $ WLM_2E                   : num  0 0 0 1 0 1 0 0 0 0 ...
##  $ WLM_2D                   : num  1 1 0 0 0 0 1 0 0 0 ...
##  $ WLM_2G                   : num  0 0 1 1 0 0 1 1 0 1 ...
##  $ WLM_2I                   : num  1 0 1 1 0 1 1 0 1 1 ...
##  $ WLM_2INT1                : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ WLM_2INT2                : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ WLM_2INT3                : logi  NA NA NA NA NA NA ...
##  $ WLM_2INT4                : logi  NA NA NA NA NA NA ...
##  $ WLM_2INT5                : logi  NA NA NA NA NA NA ...
##  $ WLM_2INT6                : logi  NA NA NA NA NA NA ...
##  $ WLM_3E                   : num  1 1 1 1 1 1 1 1 1 0 ...
##  $ WLM_3I                   : num  0 0 1 1 0 0 0 0 0 0 ...
##  $ WLM_3B                   : num  1 1 0 1 0 0 1 1 1 0 ...
##  $ WLM_3F                   : num  1 1 1 1 0 1 1 1 0 0 ...
##  $ WLM_3G                   : num  1 0 1 1 0 1 0 1 1 0 ...
##  $ WLM_3C                   : num  0 0 0 1 1 1 1 1 1 1 ...
##  $ WLM_3A                   : num  0 1 1 1 1 1 1 1 1 0 ...
##  $ WLM_3J                   : num  1 0 0 0 0 0 1 0 1 1 ...
##  $ WLM_3H                   : num  1 0 1 0 1 1 0 1 1 1 ...
##  $ WLM_3D                   : num  0 1 1 1 1 1 1 0 1 1 ...
##  $ WLM_3INT1                : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ WLM_3INT2                : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ WLM_3INT3                : logi  NA NA NA NA NA NA ...
##  $ WLM_3INT4                : logi  NA NA NA NA NA NA ...
##  $ WLM_3INT5                : logi  NA NA NA NA NA NA ...
##  $ WLM_3INT6                : logi  NA NA NA NA NA NA ...
##  $ COMMENTS_CERAD           : chr  NA NA NA NA ...
##  $ WLM_1                    : num  0 2 3 4 2 3 3 6 3 3 ...
##  $ WLM_1_STATUS             : chr  NA NA NA "partial" ...
##  $ WLM_1INT                 : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ WLM_1INT_STATUS          : chr  NA NA NA NA ...
##  $ WLM_2                    : num  6 3 5 8 3 7 6 4 6 5 ...
##  $ WLM_2_STATUS             : chr  NA NA NA NA ...
##  $ WLM_2INT                 : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ WLM_2INT_STATUS          : chr  NA NA NA NA ...
##  $ WLM_3                    : num  6 5 7 8 5 7 7 7 8 4 ...
##  $ WLM_3_STATUS             : chr  NA NA NA NA ...
##  $ WLM_3INT                 : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ WLM_3INT_STATUS          : chr  NA NA NA NA ...
##  $ RAWSCORE_CERAD           : num  12 10 15 20 10 17 16 17 17 12 ...
##  $ RAWSCORE_CERAD_STATUS    : chr  NA NA NA "partial" ...
##  $ SCALESCORE_CERAD_2       : num  4 4 5 11 4 6 6 6 6 4 ...
##  $ SCALESCORE_CERAD_2_STATUS: chr  NA NA NA "partial" ...


Pull the regenerated DD

## read the "CERAD_IMM_RC" sheet of the revised data dictionary workbook (revisedDDpath);
## the columns used below are VarNames, `Data Type` and `Valid Responses`
dfDD <- read_excel(revisedDDpath, sheet = "CERAD_IMM_RC")


Handling Logical Variables

## extract all logical variables
## (in the str() output these columns show only NA values)
logicols <- colnames(df)[sapply(df, is.logical)]

## view those variables in the regenerated DD
dfDD[dfDD$VarNames %in% logicols,c("VarNames","Data Type")]
## # A tibble: 15 × 2
##    VarNames    `Data Type`
##    <chr>       <chr>      
##  1 REFCTR      VARCHAR2(6)
##  2 REVIEW_DATE date       
##  3 REVIEWER    CHAR       
##  4 WLM_1INT3   NUMBER(1)  
##  5 WLM_1INT4   NUMBER(1)  
##  6 WLM_1INT5   NUMBER(1)  
##  7 WLM_1INT6   NUMBER(1)  
##  8 WLM_2INT3   NUMBER(1)  
##  9 WLM_2INT4   NUMBER(1)  
## 10 WLM_2INT5   NUMBER(1)  
## 11 WLM_2INT6   NUMBER(1)  
## 12 WLM_3INT3   NUMBER(1)  
## 13 WLM_3INT4   NUMBER(1)  
## 14 WLM_3INT5   NUMBER(1)  
## 15 WLM_3INT6   NUMBER(1)
## select the vars to be converted to numeric
## matched case-insensitively (like the date match below), so a lower-case
## "number" type is not mistaken for a character variable
convert2num <- dfDD$VarNames[dfDD$VarNames %in% logicols & grepl("number",dfDD$`Data Type`,ignore.case = TRUE)]

## select the vars to be converted to date
convert2date <-  dfDD$VarNames[dfDD$VarNames %in% logicols & grepl("date",dfDD$`Data Type`,ignore.case = TRUE)] ## REVIEW_DATE

## the rest should be converted to character
convert2chr <- setdiff(logicols,c(convert2num,convert2date)) ## [1] "REFCTR"   "REVIEWER"

## convert
df[convert2num] <- lapply(df[convert2num], as.numeric)
df[convert2date] <- lapply(df[convert2date], as.Date)
df[convert2chr] <- lapply(df[convert2chr], as.character)

## recheck the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
## 
## [[2]]
## [1] "character"
## 
## [[3]]
## [1] "POSIXct" "POSIXt" 
## 
## [[4]]
## [1] "Date"


Handling Date Variables

## extract date variables from sub-dataset (still POSIXct at this point)
datecols <- colnames(df)[sapply(df, function(x) inherits(x, c("POSIXct", "POSIXt")))]
## [1] "EXAM_DATE"     "DATE_OF_BIRTH" 

## extract date variables from regenerated DD
## the DD uses mixed case for this type (e.g. "date" for REVIEW_DATE), so compare case-insensitively
datecolsFromDD <- dfDD$VarNames[toupper(dfDD$`Data Type`) %in% "DATE"]

## compare the two to see if we are missing any date variables
setdiff(datecols,datecolsFromDD) ## character(0)
## character(0)
setdiff(datecolsFromDD,datecols) ## [1] "REVIEW_DATE" can ignore REVIEW_DATE, as it has been corrected in previous step
## [1] "REVIEW_DATE"
## drop = FALSE keeps a data.frame even if only one date column is present
head(df[,datecols,drop = FALSE])
##    EXAM_DATE DATE_OF_BIRTH
## 1 2023-10-25    1967-06-15
## 2 2023-08-11    1946-06-19
## 3 2023-08-14    1952-08-29
## 4 2023-08-07    1941-09-10
## 5 2023-08-18    1931-09-20
## 6 2023-06-19    1960-06-04
## convert format (POSIXct -> Date)
df[datecols] <- lapply(df[datecols], as.Date)

## recheck the unique data types
unique(sapply(df, class))
## [1] "numeric"   "character" "Date"


Handling Character Variables

## character columns present in the sub-dataset
chrcols <- names(df)[vapply(df, is.character, logical(1))] ## 17 vars

## DD entries whose declared type starts with CHAR / VARCHAR
chrColsfromDD <- dfDD[
  grepl("^(varchar|char)", dfDD[["Data Type"]], ignore.case = TRUE),
  c("VarNames", "Data Type")
]

## data type consistency check:
## mismatchChrs_1: chr in the data but another type in the DD
## mismatchChrs_2: chr in the DD but another type in the data
mismatchChrs_1 <- setdiff(chrcols, chrColsfromDD[["VarNames"]]) ## character(0)
mismatchChrs_2 <- setdiff(chrColsfromDD[["VarNames"]], chrcols) ## character(0)

## character variables for which the DD lists valid responses
tmp <- dfDD[
  !is.na(dfDD[["Valid Responses"]]) &
    grepl("CHAR|VARCHAR", dfDD[["Data Type"]], ignore.case = TRUE),
  c("VarNames", "Valid Responses")
]

## compare the unique values of those chr columns in the dataset with the DD
DT::datatable(check_valid_responses(tmp, df))
## ignore EXAMINER


Handling Numeric Variables

## numeric columns present in the sub-dataset
numcols <- names(df)[vapply(df, is.numeric, logical(1))] ## 68 vars

## numeric (NUMBER*) variables according to the DD
numColsfromDD <- dfDD[
  grepl("number", dfDD[["Data Type"]], ignore.case = TRUE),
  c("VarNames", "Valid Responses")
]

## data type consistency check:
## mismatchNums_1: numeric in the data but another type in the DD
## mismatchNums_2: numeric in the DD but another type in the data
mismatchNums_1 <- setdiff(numcols, numColsfromDD[["VarNames"]]) ## character(0)
mismatchNums_2 <- setdiff(numColsfromDD[["VarNames"]], numcols) ## character(0)

unique(numColsfromDD[["Valid Responses"]])
## [1] NA              "1 thru 99999;" "1 thru 9999;"  "1;\r\n2;"     
## [5] "1;\r\n0;"      "1;"
## keep only the numeric variables that come with a value specification
tmp <- numColsfromDD[!is.na(numColsfromDD[["Valid Responses"]]), ]

DT::datatable(check_valid_numeric_responses(tmp, df))
## ignore GP


Save Cleaned Data

## overwrite the CERAD_IMM_RC object with the type-corrected working copy
CERAD_IMM_RC <- df



CERAD_RECOG_RC

## work on a copy named df; CERAD_RECOG_RC itself is only overwritten at the end of the section
df <- CERAD_RECOG_RC

## info() is a project helper defined outside this section; judging by the printed
## line it reports #rows, #columns and #distinct SYSIND values -- TODO confirm
info(CERAD_RECOG_RC,"SYSIND")
## #obs:177, cols:48, inds:177
## extract all the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
## 
## [[2]]
## [1] "character"
## 
## [[3]]
## [1] "logical"
## 
## [[4]]
## [1] "POSIXct" "POSIXt"
## print the structure of every column (list.len raised so the column listing is not cut off)
str(df, max.level = 99, list.len = 99999)
Click for details
## 'data.frame':    177 obs. of  48 variables:
##  $ SYSXM          : num  8275863 8260583 8278763 8264053 8264793 ...
##  $ SYSIND         : num  11160523 11163453 11618053 11620393 11617573 ...
##  $ SYSGP          : num  7923793 7924953 8005213 8005493 8004733 ...
##  $ SYSGPSTUDY     : num  1361903 1363063 1451923 1452203 1451443 ...
##  $ SYSINDGP       : num  7923633 7926663 8387123 8389463 8386643 ...
##  $ CGI_ORDER      : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ GPS_ORDER      : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ STDCGI_ORDER   : num  11 11 11 11 11 11 11 11 11 11 ...
##  $ LSTUDY         : chr  "ADCRLPRADI" "ADFAMPRADI" "ADCONTROL" "ADCONTROL" ...
##  $ DB_OWNER       : chr  "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" ...
##  $ STUDY          : chr  "ALZ" "ALZ" "ALZ" "ALZ" ...
##  $ SUBSTUDY       : chr  "ADCRLPRADI" "ADFAMPRADI" "ADCONTROL" "ADCONTROL" ...
##  $ CENTER         : chr  "IHG" "IHG" "IHG" "IHG" ...
##  $ GP             : num  87883 87923 104511 104500 104525 ...
##  $ IND            : num  1 9000 1 1 1 105 110 1 1 1 ...
##  $ REFCTR         : logi  NA NA NA NA NA NA ...
##  $ EXAM_DATE      : POSIXct, format: "2024-02-14" "2023-10-25" ...
##  $ EXAMINER       : chr  "gsv32" "gsv32" "jjs2031" "jjs2031" ...
##  $ DATE_OF_BIRTH  : POSIXct, format: "1939-03-20" "1967-06-15" ...
##  $ AGE_AT_EXAM    : num  84 56 77 70 91 63 65 76 81 69 ...
##  $ REVIEW_DATE    : logi  NA NA NA NA NA NA ...
##  $ REVIEWER       : logi  NA NA NA NA NA NA ...
##  $ WLRG_PRESENT   : num  1 1 2 2 2 1 1 1 1 2 ...
##  $ WLRG_K         : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ WLRG_L         : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ WLRG_A         : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ WLRG_M         : num  1 1 0 1 1 1 1 1 1 1 ...
##  $ WLRG_B         : num  0 1 1 1 0 1 1 1 1 1 ...
##  $ WLRG_C         : num  0 1 1 1 1 1 1 1 1 1 ...
##  $ WLRG_N         : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ WLRG_D         : num  0 1 1 1 0 1 0 0 0 1 ...
##  $ WLRG_O         : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ WLRG_P         : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ WLRG_E         : num  1 1 1 1 1 1 1 0 1 1 ...
##  $ WLRG_F         : num  0 0 1 1 1 1 1 1 1 1 ...
##  $ WLRG_Q         : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ WLRG_G         : num  0 1 0 1 0 1 0 1 1 1 ...
##  $ WLRG_R         : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ WLRG_S         : num  0 1 1 1 1 1 1 1 1 1 ...
##  $ WLRG_H         : num  0 1 1 1 0 1 1 1 0 1 ...
##  $ WLRG_T         : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ WLRG_I         : num  0 1 1 1 1 1 0 1 1 1 ...
##  $ WLRG_J         : num  0 1 1 1 1 1 0 1 1 1 ...
##  $ COMMENTS_WLRG  : chr  NA NA NA NA ...
##  $ WLRG_YES       : num  2 9 9 10 6 10 6 8 8 10 ...
##  $ WLRG_YES_STATUS: logi  NA NA NA NA NA NA ...
##  $ WLRG_NO        : num  9 10 9 10 10 10 10 10 10 10 ...
##  $ WLRG_NO_STATUS : logi  NA NA NA NA NA NA ...


Pull the regenerated DD

## read the "CERAD_RECOG_RC" sheet of the revised data dictionary workbook (revisedDDpath);
## the columns used below are VarNames, `Data Type` and `Valid Responses`
dfDD <- read_excel(revisedDDpath, sheet = "CERAD_RECOG_RC")


Handling Logical Variables

## extract all logical variables
## (in the str() output these columns show only NA values)
logicols <- colnames(df)[sapply(df, is.logical)]

## view those variables in the regenerated DD
dfDD[dfDD$VarNames %in% logicols,c("VarNames","Data Type")]
## # A tibble: 5 × 2
##   VarNames        `Data Type`
##   <chr>           <chr>      
## 1 REFCTR          VARCHAR2(6)
## 2 REVIEW_DATE     date       
## 3 REVIEWER        CHAR       
## 4 WLRG_YES_STATUS CHAR       
## 5 WLRG_NO_STATUS  CHAR
## select the vars to be converted to numeric
## NOTE: convert2num has to be re-derived for every sub-dataset; otherwise the
## setdiff() below silently reuses whatever an earlier section left behind
convert2num <- dfDD$VarNames[dfDD$VarNames %in% logicols & grepl("number",dfDD$`Data Type`,ignore.case = TRUE)] ## character(0)

## select the vars to be converted to date
convert2date <-  dfDD$VarNames[dfDD$VarNames %in% logicols & grepl("date",dfDD$`Data Type`,ignore.case = TRUE)] ## REVIEW_DATE

## the rest should be converted to character
convert2chr <- setdiff(logicols,c(convert2num,convert2date)) ## [1] "REFCTR"  "REVIEWER" "WLRG_YES_STATUS" "WLRG_NO_STATUS" 

## convert (each assignment is a no-op when its selection is empty)
df[convert2num] <- lapply(df[convert2num], as.numeric)
df[convert2date] <- lapply(df[convert2date], as.Date)
df[convert2chr] <- lapply(df[convert2chr], as.character)

## recheck the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
## 
## [[2]]
## [1] "character"
## 
## [[3]]
## [1] "POSIXct" "POSIXt" 
## 
## [[4]]
## [1] "Date"


Handling Date Variables

## extract date variables from sub-dataset (still POSIXct at this point)
datecols <- colnames(df)[sapply(df, function(x) inherits(x, c("POSIXct", "POSIXt")))]
## [1] "EXAM_DATE"     "DATE_OF_BIRTH" 

## extract date variables from regenerated DD
## the DD uses mixed case for this type (e.g. "date" for REVIEW_DATE), so compare case-insensitively
datecolsFromDD <- dfDD$VarNames[toupper(dfDD$`Data Type`) %in% "DATE"]

## compare the two to see if we are missing any date variables
setdiff(datecols,datecolsFromDD) ## character(0)
## character(0)
setdiff(datecolsFromDD,datecols) ## [1] "REVIEW_DATE" can ignore REVIEW_DATE, as it has been corrected in previous step
## [1] "REVIEW_DATE"
## drop = FALSE keeps a data.frame even if only one date column is present
head(df[,datecols,drop = FALSE])
##    EXAM_DATE DATE_OF_BIRTH
## 1 2024-02-14    1939-03-20
## 2 2023-10-25    1967-06-15
## 3 2023-08-11    1946-06-19
## 4 2023-08-14    1952-08-29
## 5 2023-09-18    1931-09-20
## 6 2023-06-19    1960-06-04
## convert format (POSIXct -> Date)
df[datecols] <- lapply(df[datecols], as.Date)

## recheck the unique data types
unique(sapply(df, class))
## [1] "numeric"   "character" "Date"


Handling Character Variables

## character columns present in the sub-dataset
chrcols <- names(df)[vapply(df, is.character, logical(1))] ## 11 vars

## DD entries whose declared type starts with CHAR / VARCHAR
chrColsfromDD <- dfDD[
  grepl("^(varchar|char)", dfDD[["Data Type"]], ignore.case = TRUE),
  c("VarNames", "Data Type")
]

## data type consistency check:
## mismatchChrs_1: chr in the data but another type in the DD
## mismatchChrs_2: chr in the DD but another type in the data
mismatchChrs_1 <- setdiff(chrcols, chrColsfromDD[["VarNames"]]) ## character(0)
mismatchChrs_2 <- setdiff(chrColsfromDD[["VarNames"]], chrcols) ## character(0)

## character variables for which the DD lists valid responses
tmp <- dfDD[
  !is.na(dfDD[["Valid Responses"]]) &
    grepl("CHAR|VARCHAR", dfDD[["Data Type"]], ignore.case = TRUE),
  c("VarNames", "Valid Responses")
]

## compare the unique values of those chr columns in the dataset with the DD
DT::datatable(check_valid_responses(tmp, df))
## ignore EXAMINER


Handling Numeric Variables

## numeric columns present in the sub-dataset
numcols <- names(df)[vapply(df, is.numeric, logical(1))] ## 34 vars

## numeric (NUMBER*) variables according to the DD
numColsfromDD <- dfDD[
  grepl("number", dfDD[["Data Type"]], ignore.case = TRUE),
  c("VarNames", "Valid Responses")
]

## data type consistency check:
## mismatchNums_1: numeric in the data but another type in the DD
## mismatchNums_2: numeric in the DD but another type in the data
mismatchNums_1 <- setdiff(numcols, numColsfromDD[["VarNames"]]) ## character(0)
mismatchNums_2 <- setdiff(numColsfromDD[["VarNames"]], numcols) ## character(0)

unique(numColsfromDD[["Valid Responses"]])
## [1] NA              "1 thru 99999;" "1 thru 9999;"  "1;\r\n2;"     
## [5] "1;\r\n0;"      "0;\r\n1;"
## keep only the numeric variables that come with a value specification
tmp <- numColsfromDD[!is.na(numColsfromDD[["Valid Responses"]]), ]

DT::datatable(check_valid_numeric_responses(tmp, df))
## ignore GP


Save Cleaned Data

## overwrite the CERAD_RECOG_RC object with the type-corrected working copy
CERAD_RECOG_RC <- df



CONSENSUS_DX

## work on a copy named df so CONSENSUS_DX itself is left untouched while cleaning
df <- CONSENSUS_DX

## info() is a project helper defined outside this section; judging by the printed
## line it reports #rows, #columns and #distinct SYSIND values -- TODO confirm
## (here #obs > #inds, i.e. some SYSIND values occur on more than one row)
info(CONSENSUS_DX,"SYSIND")
## #obs:1807, cols:43, inds:1584
## extract all the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
## 
## [[2]]
## [1] "character"
## 
## [[3]]
## [1] "logical"
## 
## [[4]]
## [1] "POSIXct" "POSIXt"
## print the structure of every column (list.len raised so the column listing is not cut off)
str(df, max.level = 99, list.len = 99999)
Click for details
## 'data.frame':    1807 obs. of  43 variables:
##  $ SYSXM            : num  7583263 7583273 7583283 7583293 7583303 ...
##  $ SYSIND           : num  11039963 11063713 11063723 11063703 11064573 ...
##  $ SYSGP            : num  7896303 7896303 7896303 7896303 7896953 ...
##  $ SYSGPSTUDY       : num  1311623 1311623 1311623 1311623 1312273 ...
##  $ SYSINDGP         : num  7795703 7822643 7822653 7822633 7823493 ...
##  $ CGI_ORDER        : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ GPS_ORDER        : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ STDCGI_ORDER     : num  11 11 11 11 11 11 11 11 11 11 ...
##  $ LSTUDY           : chr  "ADFAMPRADI" "ADFAMPRADI" "ADFAMPRADI" "ADFAMPRADI" ...
##  $ DB_OWNER         : chr  "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" ...
##  $ STUDY            : chr  "ALZ" "ALZ" "ALZ" "ALZ" ...
##  $ SUBSTUDY         : chr  "ADFAMPRADI" "ADFAMPRADI" "ADFAMPRADI" "ADFAMPRADI" ...
##  $ CENTER           : chr  "IHG" "IHG" "IHG" "IHG" ...
##  $ GP               : num  87663 87663 87663 87663 87682 ...
##  $ IND              : num  101 115 116 113 1008 ...
##  $ REFCTR           : logi  NA NA NA NA NA NA ...
##  $ REVIEW_DATE      : POSIXct, format: "2018-07-11" "2018-07-11" ...
##  $ REVIEWER         : chr  "v.rodriguez4" "v.rodriguez4" "v.rodriguez4" "v.rodriguez4" ...
##  $ DATE_OF_BIRTH    : POSIXct, format: "1943-10-18" "1939-08-25" ...
##  $ RANK             : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ CDX              : chr  "Alzheimers Disease" "Alzheimers Disease" "Alzheimers Disease" "Alzheimers Disease" ...
##  $ SUB_DX           : chr  NA NA NA NA ...
##  $ IMPRESSION       : chr  "POSSIBLE" "POSSIBLE" "POSSIBLE" "POSSIBLE" ...
##  $ WHO_DX           : chr  "MC,KC,VR" "MC,KC,VR" "MC,KC,VR" "MC,KC,VR" ...
##  $ DATE_DX          : POSIXct, format: "2018-07-11" "2018-07-11" ...
##  $ COMMENTS         : chr  NA NA NA NA ...
##  $ CLINICAL_COMMENTS: logi  NA NA NA NA NA NA ...
##  $ OTHER_TXT1       : logi  NA NA NA NA NA NA ...
##  $ OTHER_TXT2       : logi  NA NA NA NA NA NA ...
##  $ OTHER_TXT3       : logi  NA NA NA NA NA NA ...
##  $ CALC_VAL1        : logi  NA NA NA NA NA NA ...
##  $ CALC_VAL2        : logi  NA NA NA NA NA NA ...
##  $ CALC_VAL3        : logi  NA NA NA NA NA NA ...
##  $ CALC_VAL4        : logi  NA NA NA NA NA NA ...
##  $ CALC_VAL5        : logi  NA NA NA NA NA NA ...
##  $ CALC_VAL6        : logi  NA NA NA NA NA NA ...
##  $ CALC_VAL7        : logi  NA NA NA NA NA NA ...
##  $ CALC_VAL8        : logi  NA NA NA NA NA NA ...
##  $ CALC_VAL9        : logi  NA NA NA NA NA NA ...
##  $ CALC_VAL10       : logi  NA NA NA NA NA NA ...
##  $ CALC_VAL11       : logi  NA NA NA NA NA NA ...
##  $ LAST_SOURCE      : chr  "CHIMERA_USER" "CHIMERA_USER" "CHIMERA_USER" "CHIMERA_USER" ...
##  $ OTHER_DATE1      : logi  NA NA NA NA NA NA ...


Pull the regenerated DD

## load the CONSENSUS_DX sheet of the revised data dictionary (DD)
dfDD <- read_excel(path = revisedDDpath, sheet = "CONSENSUS_DX")


Handling Logical Variables

## extract all logical variables
logicols <- colnames(df)[vapply(df, is.logical, logical(1))]

## view those variables in the regenerated DD
dfDD[dfDD$VarNames %in% logicols,c("VarNames","Data Type")]
## # A tibble: 17 × 2
##    VarNames          `Data Type`
##    <chr>             <chr>      
##  1 REFCTR            VARCHAR2(6)
##  2 CLINICAL_COMMENTS CHAR       
##  3 OTHER_TXT1        CHAR       
##  4 OTHER_TXT2        CHAR       
##  5 OTHER_TXT3        CHAR       
##  6 CALC_VAL1         NUMBER     
##  7 CALC_VAL2         NUMBER     
##  8 CALC_VAL3         NUMBER     
##  9 CALC_VAL4         NUMBER     
## 10 CALC_VAL5         NUMBER     
## 11 CALC_VAL6         NUMBER     
## 12 CALC_VAL7         NUMBER     
## 13 CALC_VAL8         NUMBER     
## 14 CALC_VAL9         NUMBER     
## 15 CALC_VAL10        NUMBER     
## 16 CALC_VAL11        NUMBER     
## 17 OTHER_DATE1       DATE
## select the vars to be converted to numeric
## the DD type is matched case-insensitively, exactly like the date match below
## and the numeric check later on, so a lower-case "number" entry is not
## silently sent to the character bucket
convert2num <- dfDD$VarNames[dfDD$VarNames %in% logicols & grepl("number", dfDD$`Data Type`, ignore.case = TRUE)]
## 11 vars

## select the vars to be converted to date
convert2date <- dfDD$VarNames[dfDD$VarNames %in% logicols & grepl("date", dfDD$`Data Type`, ignore.case = TRUE)] ## OTHER_DATE1

## the rest should be converted to character
convert2chr <- setdiff(logicols, c(convert2num, convert2date))
## "REFCTR"  "CLINICAL_COMMENTS" "OTHER_TXT1"  "OTHER_TXT2"  "OTHER_TXT3" 

## convert
df[convert2num] <- lapply(df[convert2num], as.numeric)
df[convert2date] <- lapply(df[convert2date], as.Date)
df[convert2chr] <- lapply(df[convert2chr], as.character)

## recheck the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
## 
## [[2]]
## [1] "character"
## 
## [[3]]
## [1] "POSIXct" "POSIXt" 
## 
## [[4]]
## [1] "Date"


Handling Date Variables

## extract date variables from sub-dataset
datecols <- colnames(df)[vapply(df, function(x) inherits(x, c("POSIXct", "POSIXt")), logical(1))]
## [1] "REVIEW_DATE"   "DATE_OF_BIRTH" "DATE_DX"

## extract date variables from regenerated DD
## normalise case and surrounding whitespace so "DATE", "date" and "Date" all match
datecolsFromDD <- dfDD$VarNames[toupper(trimws(dfDD$`Data Type`)) %in% "DATE"]

## compare the two to see if we are missing any date variables
setdiff(datecols,datecolsFromDD) ## character(0)
## character(0)
setdiff(datecolsFromDD,datecols) ## [1] "OTHER_DATE1" can ignore OTHER_DATE1, as it has been corrected in previous step
## [1] "OTHER_DATE1"
## drop = FALSE keeps a data frame even if only one date column is present
head(df[, datecols, drop = FALSE])
##   REVIEW_DATE DATE_OF_BIRTH    DATE_DX
## 1  2018-07-11    1943-10-18 2018-07-11
## 2  2018-07-11    1939-08-25 2018-07-11
## 3  2018-07-11    1934-06-13 2018-07-11
## 4  2018-07-11    1924-10-24 2018-07-11
## 5  2018-07-11    1920-11-01 2018-07-11
## 6  2018-07-11    1956-06-07 2018-07-11
## convert format
df[datecols] <- lapply(df[datecols], as.Date)

## recheck the unique data types
unique(sapply(df, class))
## [1] "numeric"   "character" "Date"


Handling Character Variables

## extract character variables from sub-dataset
chrcols <- colnames(df)[vapply(df, is.character, logical(1))] ## 17 vars

## check data type inconsistency:
## mismatchChrs_1: present as chr in data but others in the DD
## mismatchChrs_2: present as chr in DD but others in the data
## a single mask defines "character in the DD" for both the mismatch check and
## the valid-response check below, so the two cannot disagree
isChrInDD <- grepl("^(varchar|char)", dfDD$`Data Type`, ignore.case = TRUE)
chrColsfromDD <- dfDD[isChrInDD, c("VarNames","Data Type")]

mismatchChrs_1 <- setdiff(chrcols,chrColsfromDD$VarNames) ## character(0)
mismatchChrs_2 <- setdiff(chrColsfromDD$VarNames,chrcols) ## character(0)

## extract character variables with value specification
tmp <- dfDD[isChrInDD & !is.na(dfDD$`Valid Responses`), c("VarNames","Valid Responses")]

## check whether the unique values of the chr columns in the dataset match the DD
DT::datatable(check_valid_responses(tmp,df))
## Ignore REVIEWER, for others, waiting for confirmation from Mike, should I add those invalid values to the DD?


Handling Numeric Variables

## names of the numeric columns in the sub-dataset
numcols <- names(Filter(is.numeric, df)) ## 22 vars

## DD rows declared as NUMBER, used to check data type inconsistency:
## mismatchNums_1: numeric in the data but declared otherwise in the DD
## mismatchNums_2: declared numeric in the DD but stored otherwise in the data
isNumInDD <- grepl("number", dfDD$`Data Type`, ignore.case = TRUE)
numColsfromDD <- dfDD[isNumInDD, c("VarNames","Valid Responses")]

mismatchNums_1 <- setdiff(numcols, numColsfromDD$VarNames) ## character(0)
mismatchNums_2 <- setdiff(numColsfromDD$VarNames, numcols) ## character(0)

unique(numColsfromDD$`Valid Responses`)
## [1] NA              "1 thru 99999;" "1 thru 9999;"
## keep only the numeric DD entries that specify valid responses
tmp <- numColsfromDD[!is.na(numColsfromDD$`Valid Responses`), ]

DT::datatable(check_valid_numeric_responses(tmp, df))
## ignore GP


Save Cleaned Data

## assign the type-corrected working copy `df` to CONSENSUS_DX
CONSENSUS_DX <- df



CRAFT_21_DEL_RC

## work on a copy named `df`; the steps below modify `df` only
df <- CRAFT_21_DEL_RC

info(CRAFT_21_DEL_RC,"SYSIND")
## #obs:523, cols:95, inds:519
## list the distinct column classes present in the sub-dataset
colClasses <- sapply(df, class)
unique(colClasses)
## [[1]]
## [1] "numeric"
## 
## [[2]]
## [1] "character"
## 
## [[3]]
## [1] "logical"
## 
## [[4]]
## [1] "POSIXct" "POSIXt"
## print the full structure; the large limits keep str() from truncating the column list
str(object = df, max.level = 99, list.len = 99999)
Click for details
## 'data.frame':    523 obs. of  95 variables:
##  $ SYSXM            : num  8275923 8276563 8258913 8259013 8260163 ...
##  $ SYSIND           : num  11620763 11369703 11369813 11037673 11620563 ...
##  $ SYSGP            : num  8005723 7951913 7952013 7894423 8005633 ...
##  $ SYSGPSTUDY       : num  1452433 1397023 1397123 1309743 1452343 ...
##  $ SYSINDGP         : num  8389833 8138973 8139083 7793413 8389633 ...
##  $ CGI_ORDER        : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ GPS_ORDER        : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ STDCGI_ORDER     : num  11 11 11 11 11 11 11 11 11 11 ...
##  $ LSTUDY           : chr  "ADCONTROL" "ADCRLPRADI" "ADCRLPRADI" "ADFAMPRADI" ...
##  $ DB_OWNER         : chr  "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" ...
##  $ STUDY            : chr  "ALZ" "ALZ" "ALZ" "ALZ" ...
##  $ SUBSTUDY         : chr  "ADCONTROL" "ADCRLPRADI" "ADCRLPRADI" "ADFAMPRADI" ...
##  $ CENTER           : chr  "IHG" "IHG" "IHG" "IHG" ...
##  $ GP               : num  104457 88299 88301 87650 104477 ...
##  $ IND              : num  1 1 1 9000 1 1 1 1 1 1 ...
##  $ REFCTR           : logi  NA NA NA NA NA NA ...
##  $ EXAM_DATE        : POSIXct, format: "2023-04-17" "2024-02-13" ...
##  $ EXAMINER         : chr  "sjt82" "gsv32" "jjs2031" "gsv32" ...
##  $ DATE_OF_BIRTH    : POSIXct, format: "1946-12-19" "1944-09-22" ...
##  $ AGE_AT_EXAM      : num  76 79 76 68 73 73 81 86 86 81 ...
##  $ REVIEW_DATE      : logi  NA NA NA NA NA NA ...
##  $ REVIEWER         : logi  NA NA NA NA NA NA ...
##  $ CRAFTDVR_ENTRY   : logi  NA NA NA NA NA NA ...
##  $ CRAFTDTI         : POSIXct, format: "2023-04-17 12:58:00" "2024-02-13 10:56:00" ...
##  $ CRAFTDVR1        : num  0 0 0 1 1 1 0 1 0 0 ...
##  $ CRAFTDVR2        : num  0 1 1 1 1 1 1 1 1 1 ...
##  $ CRAFTDVR3        : num  0 1 0 0 0 1 0 0 0 0 ...
##  $ CRAFTDVR4        : num  1 1 1 1 1 0 0 1 1 0 ...
##  $ CRAFTDVR5        : num  0 0 0 1 1 1 0 1 1 1 ...
##  $ CRAFTDVR6        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ CRAFTDVR7        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ CRAFTDVR8        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ CRAFTDVR9        : num  0 0 0 1 0 0 0 0 0 0 ...
##  $ CRAFTDVR10       : num  0 0 0 1 0 0 0 0 0 0 ...
##  $ CRAFTDVR11       : num  1 0 0 0 0 0 0 0 0 0 ...
##  $ CRAFTDVR12       : num  1 0 0 0 0 0 0 0 0 0 ...
##  $ CRAFTDVR13       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ CRAFTDVR14       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ CRAFTDVR15       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ CRAFTDVR16       : num  0 0 0 1 0 0 0 0 0 0 ...
##  $ CRAFTDVR17       : num  0 0 0 1 0 0 0 0 0 0 ...
##  $ CRAFTDVR18       : num  0 0 0 0 0 1 0 0 1 0 ...
##  $ CRAFTDVR19       : num  0 0 0 0 0 1 0 0 1 1 ...
##  $ CRAFTDVR20       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ CRAFTDVR21       : num  1 0 0 0 1 1 1 0 0 0 ...
##  $ CRAFTDVR22       : num  1 0 1 0 1 1 0 1 1 0 ...
##  $ CRAFTDVR23       : num  1 0 0 0 0 0 0 1 0 0 ...
##  $ CRAFTDVR24       : num  1 0 0 0 0 1 0 1 0 0 ...
##  $ CRAFTDVR25       : num  0 0 0 0 0 1 0 0 0 0 ...
##  $ CRAFTDVR26       : num  1 0 0 0 1 1 0 0 0 0 ...
##  $ CRAFTDVR27       : num  0 0 0 0 1 1 0 0 0 0 ...
##  $ CRAFTDVR28       : num  0 1 0 0 0 1 1 1 1 1 ...
##  $ CRAFTDVR29       : num  0 0 0 0 1 1 0 0 0 0 ...
##  $ CRAFTDVR30       : num  0 0 0 0 1 1 0 0 1 0 ...
##  $ CRAFTDVR31       : num  0 0 0 0 1 1 0 0 1 0 ...
##  $ CRAFTDVR32       : num  0 1 0 0 1 1 0 0 1 1 ...
##  $ CRAFTDVR33       : num  0 0 0 0 1 0 0 0 0 0 ...
##  $ CRAFTDVR34       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ CRAFTDVR35       : num  0 0 0 0 1 0 0 1 0 0 ...
##  $ CRAFTDVR36       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ CRAFTDVR37       : num  0 0 0 0 0 0 0 0 1 0 ...
##  $ CRAFTDVR38       : num  0 0 0 0 0 0 0 0 1 0 ...
##  $ CRAFTDVR39       : num  0 0 0 1 1 1 0 0 1 0 ...
##  $ CRAFTDVR40       : num  0 0 0 1 0 0 0 0 0 0 ...
##  $ CRAFTDVR41       : num  0 0 0 1 1 1 0 0 1 1 ...
##  $ CRAFTDVR42       : num  0 0 0 1 0 1 0 0 0 0 ...
##  $ CRAFTDVR43       : num  0 0 0 1 1 1 0 0 1 1 ...
##  $ CRAFTDVR44       : num  0 0 0 1 1 1 0 1 1 1 ...
##  $ CRAFTDRE1        : num  0 0 0 1 1 1 0 1 1 0 ...
##  $ CRAFTDRE2        : num  0 1 1 1 1 1 1 1 1 1 ...
##  $ CRAFTDRE3        : num  0 1 0 0 0 1 0 0 0 1 ...
##  $ CRAFTDRE4        : num  1 1 1 1 1 0 0 1 1 1 ...
##  $ CRAFTDRE5        : num  0 0 0 1 1 1 0 1 1 1 ...
##  $ CRAFTDRE6        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ CRAFTDRE7        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ CRAFTDRE8        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ CRAFTDRE9        : num  1 1 0 0 0 0 0 0 0 0 ...
##  $ CRAFTDRE10       : num  0 0 0 1 0 0 0 0 0 0 ...
##  $ CRAFTDRE11       : num  0 0 0 1 0 0 0 0 0 0 ...
##  $ CRAFTDRE12       : num  0 0 0 0 0 1 0 0 1 1 ...
##  $ CRAFTDRE13       : num  1 0 0 0 1 1 0 0 1 1 ...
##  $ CRAFTDRE14       : num  1 0 1 1 1 1 1 1 1 0 ...
##  $ CRAFTDRE15       : num  1 0 0 0 0 1 0 1 0 0 ...
##  $ CRAFTDRE16       : num  1 0 0 1 1 1 0 0 0 0 ...
##  $ CRAFTDRE17       : num  1 1 0 1 0 1 1 1 1 1 ...
##  $ CRAFTDRE18       : num  0 0 0 0 1 1 0 0 1 0 ...
##  $ CRAFTDRE19       : num  0 0 0 1 1 1 0 0 1 0 ...
##  $ CRAFTDRE20       : num  0 1 0 0 1 1 0 0 1 0 ...
##  $ CRAFTDRE21       : num  0 0 0 0 1 1 1 1 1 1 ...
##  $ CRAFTDRE22       : num  0 0 0 0 0 0 0 0 1 0 ...
##  $ CRAFTDRE23       : num  0 0 0 1 1 1 0 0 1 0 ...
##  $ CRAFTDRE24       : num  0 0 0 1 1 1 0 0 1 1 ...
##  $ CRAFTDRE25       : num  0 0 0 1 1 1 1 0 1 1 ...
##  $ CRAFTCUE         : num  0 1 0 0 1 1 1 1 1 0 ...
##  $ COMMENTS_CRAFTDRE: chr  NA NA NA NA ...


Pull the regenerated DD

## load the CRAFT_21_DEL_RC sheet of the revised data dictionary (DD)
dfDD <- read_excel(path = revisedDDpath, sheet = "CRAFT_21_DEL_RC")


Handling Logical Variables

## extract all logical variables
logicols <- colnames(df)[vapply(df, is.logical, logical(1))]

## view those variables in the regenerated DD
dfDD[dfDD$VarNames %in% logicols,c("VarNames","Data Type")]
## # A tibble: 4 × 2
##   VarNames       `Data Type`  
##   <chr>          <chr>        
## 1 REFCTR         VARCHAR2(6)  
## 2 REVIEW_DATE    date         
## 3 REVIEWER       CHAR         
## 4 CRAFTDVR_ENTRY VARCHAR2(500)
## select the vars to be converted to numeric
## recomputed from this sub-dataset's DD; otherwise setdiff() below would use
## whatever convert2num was left over from the previously processed sub-dataset
convert2num <- dfDD$VarNames[dfDD$VarNames %in% logicols & grepl("number", dfDD$`Data Type`, ignore.case = TRUE)]
## none for this sub-dataset (see the table above)

## select the vars to be converted to date
convert2date <- dfDD$VarNames[dfDD$VarNames %in% logicols & grepl("date", dfDD$`Data Type`, ignore.case = TRUE)] ## REVIEW_DATE

## the rest should be converted to character
convert2chr <- setdiff(logicols, c(convert2num, convert2date)) ## [1] "REFCTR"         "REVIEWER"       "CRAFTDVR_ENTRY"

## convert
if (length(convert2num) > 0) {
  df[convert2num] <- lapply(df[convert2num], as.numeric)
}
df[convert2date] <- lapply(df[convert2date], as.Date)
df[convert2chr] <- lapply(df[convert2chr], as.character)

## recheck the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
## 
## [[2]]
## [1] "character"
## 
## [[3]]
## [1] "POSIXct" "POSIXt" 
## 
## [[4]]
## [1] "Date"


Handling Date Variables

## extract date variables from sub-dataset
datecols <- colnames(df)[vapply(df, function(x) inherits(x, c("POSIXct", "POSIXt")), logical(1))]
## [1] "EXAM_DATE"     "DATE_OF_BIRTH" "CRAFTDTI"

## extract date variables from regenerated DD
## normalise case and surrounding whitespace so "DATE", "date" and "Date" all match
datecolsFromDD <- dfDD$VarNames[toupper(trimws(dfDD$`Data Type`)) %in% "DATE"]

## compare the two to see if we are missing any date variables
setdiff(datecols,datecolsFromDD) ## character(0)
## character(0)
setdiff(datecolsFromDD,datecols) ## [1] "REVIEW_DATE" can ignore REVIEW_DATE, as it has been corrected in previous step
## [1] "REVIEW_DATE"
## drop = FALSE keeps a data frame even if only one date column is present
head(df[, datecols, drop = FALSE])
##    EXAM_DATE DATE_OF_BIRTH            CRAFTDTI
## 1 2023-04-17    1946-12-19 2023-04-17 12:58:00
## 2 2024-02-13    1944-09-22 2024-02-13 10:56:00
## 3 2024-02-13    1947-05-13 2024-02-13 10:59:00
## 4 2023-10-24    1954-10-29 2023-10-24 14:33:00
## 5 2023-05-15    1949-12-01 2023-05-15 10:23:00
## 6 2023-05-15    1950-04-02 2023-05-15 12:07:00
## convert format
## CRAFTDTI contains a timestamp (see the preview above), so it stays POSIXct,
## the same treatment CRAFTVRS_TIME gets in CRAFT_21_IMM_RC; as.Date() would
## discard the time of day. Only the other two are converted to date format
datecols <- setdiff(datecols, "CRAFTDTI")
df[datecols] <- lapply(df[datecols], as.Date)

## recheck the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
## 
## [[2]]
## [1] "character"
## 
## [[3]]
## [1] "Date"
## 
## [[4]]
## [1] "POSIXct" "POSIXt"


Handling Character Variables

## extract character variables from sub-dataset
chrcols <- colnames(df)[vapply(df, is.character, logical(1))] ## 10 vars

## check data type inconsistency:
## mismatchChrs_1: present as chr in data but others in the DD
## mismatchChrs_2: present as chr in DD but others in the data
## a single mask defines "character in the DD" for both the mismatch check and
## the valid-response check below, so the two cannot disagree
isChrInDD <- grepl("^(varchar|char)", dfDD$`Data Type`, ignore.case = TRUE)
chrColsfromDD <- dfDD[isChrInDD, c("VarNames","Data Type")]

mismatchChrs_1 <- setdiff(chrcols,chrColsfromDD$VarNames) ## character(0)
mismatchChrs_2 <- setdiff(chrColsfromDD$VarNames,chrcols) ## character(0)

## extract character variables with value specification
tmp <- dfDD[isChrInDD & !is.na(dfDD$`Valid Responses`), c("VarNames","Valid Responses")]

## check whether the unique values of the chr columns in the dataset match the DD
DT::datatable(check_valid_responses(tmp,df))
## ignore EXAMINER


Handling Numeric Variables

## names of the numeric columns in the sub-dataset
numcols <- names(Filter(is.numeric, df)) ## 81 vars

## DD rows declared as NUMBER, used to check data type inconsistency:
## mismatchNums_1: numeric in the data but declared otherwise in the DD
## mismatchNums_2: declared numeric in the DD but stored otherwise in the data
isNumInDD <- grepl("number", dfDD$`Data Type`, ignore.case = TRUE)
numColsfromDD <- dfDD[isNumInDD, c("VarNames","Valid Responses")]

mismatchNums_1 <- setdiff(numcols, numColsfromDD$VarNames) ## character(0)
mismatchNums_2 <- setdiff(numColsfromDD$VarNames, numcols) ## character(0)

unique(numColsfromDD$`Valid Responses`)
## [1] NA              "1 thru 99999;" "1 thru 9999;"  "0;\r\n1;"
## keep only the numeric DD entries that specify valid responses
tmp <- numColsfromDD[!is.na(numColsfromDD$`Valid Responses`), ]

DT::datatable(check_valid_numeric_responses(tmp, df))
## ignore GP


Save Cleaned Data

## overwrite CRAFT_21_DEL_RC with the type-corrected working copy `df`
CRAFT_21_DEL_RC <- df



CRAFT_21_IMM_RC

## work on a copy named `df`; the steps below modify `df` only
df <- CRAFT_21_IMM_RC

info(CRAFT_21_IMM_RC,"SYSIND")
## #obs:530, cols:98, inds:525
## list the distinct column classes present in the sub-dataset
colClasses <- sapply(df, class)
unique(colClasses)
## [[1]]
## [1] "numeric"
## 
## [[2]]
## [1] "character"
## 
## [[3]]
## [1] "logical"
## 
## [[4]]
## [1] "POSIXct" "POSIXt"
## print the full structure; the large limits keep str() from truncating the column list
str(object = df, max.level = 99, list.len = 99999)
Click for details
## 'data.frame':    530 obs. of  98 variables:
##  $ SYSXM                : num  8258833 8258863 8260063 8277603 8277783 ...
##  $ SYSIND               : num  11369813 11037673 11620563 11435853 11638763 ...
##  $ SYSGP                : num  7952013 7894423 8005633 7962813 8007323 ...
##  $ SYSGPSTUDY           : num  1397123 1309743 1452343 1407923 1454033 ...
##  $ SYSINDGP             : num  8139083 7793413 8389633 8205123 8407833 ...
##  $ CGI_ORDER            : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ GPS_ORDER            : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ STDCGI_ORDER         : num  11 11 11 11 11 11 11 11 11 11 ...
##  $ LSTUDY               : chr  "ADCRLPRADI" "ADFAMPRADI" "ADCONTROL" "ADCRLPRADI" ...
##  $ DB_OWNER             : chr  "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" ...
##  $ STUDY                : chr  "ALZ" "ALZ" "ALZ" "ALZ" ...
##  $ SUBSTUDY             : chr  "ADCRLPRADI" "ADFAMPRADI" "ADCONTROL" "ADCRLPRADI" ...
##  $ CENTER               : chr  "IHG" "IHG" "IHG" "IHG" ...
##  $ GP                   : num  88301 87650 104477 88452 104540 ...
##  $ IND                  : num  1 9000 1 1 1 1 105 1 1 1 ...
##  $ REFCTR               : logi  NA NA NA NA NA NA ...
##  $ EXAM_DATE            : POSIXct, format: "2024-02-13" "2023-10-24" ...
##  $ EXAMINER             : chr  "jjs2031" "gsv32" "jjs2031" "gsv32" ...
##  $ DATE_OF_BIRTH        : POSIXct, format: "1947-05-13" "1954-10-29" ...
##  $ AGE_AT_EXAM          : num  76 68 73 81 86 86 71 73 81 79 ...
##  $ REVIEW_DATE          : logi  NA NA NA NA NA NA ...
##  $ REVIEWER             : logi  NA NA NA NA NA NA ...
##  $ CRAFTVRS_ENTRY       : logi  NA NA NA NA NA NA ...
##  $ CRAFTVRS_TIME        : POSIXct, format: "2024-02-13 10:41:00" "2023-10-24 14:18:00" ...
##  $ CRAFTVRS1            : num  1 1 1 1 1 1 0 1 0 0 ...
##  $ CRAFTVRS2            : num  1 1 1 1 1 1 0 1 1 1 ...
##  $ CRAFTVRS3            : num  0 0 0 0 0 0 0 0 0 1 ...
##  $ CRAFTVRS4            : num  1 1 1 1 1 0 0 0 1 1 ...
##  $ CRAFTVRS6            : num  0 1 0 0 0 0 0 0 0 0 ...
##  $ CRAFTVRS7            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ CRAFTVRS8            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ CRAFTVRS5            : num  0 1 1 0 1 1 0 0 1 1 ...
##  $ CRAFTVRS9            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ CRAFTVRS10           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ CRAFTVRS11           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ CRAFTVRS12           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ CRAFTVRS13           : num  0 0 0 0 1 0 0 0 0 0 ...
##  $ CRAFTVRS14           : num  0 0 0 0 1 0 0 0 0 0 ...
##  $ CRAFTVRS15           : num  0 0 0 0 1 0 0 0 0 0 ...
##  $ CRAFTVRS16           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ CRAFTVRS17           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ CRAFTVRS18           : num  0 0 1 0 0 0 0 0 1 1 ...
##  $ CRAFTVRS19           : num  0 0 1 0 0 0 0 0 1 1 ...
##  $ CRAFTVRS20           : num  0 0 1 0 0 1 0 0 0 1 ...
##  $ CRAFTVRS21           : num  0 1 1 0 0 0 0 1 1 1 ...
##  $ CRAFTVRS22           : num  1 1 1 1 1 1 0 1 1 1 ...
##  $ CRAFTVRS23           : num  0 0 1 0 0 0 0 1 0 0 ...
##  $ CRAFTVRS24           : num  0 0 1 0 0 0 0 1 1 0 ...
##  $ CRAFTVRS25           : num  0 1 1 0 0 0 0 1 0 1 ...
##  $ CRAFTVRS26           : num  0 1 1 0 1 0 0 1 0 1 ...
##  $ CRAFTVRS27           : num  0 1 1 0 1 0 0 1 0 1 ...
##  $ CRAFTVRS28           : num  1 1 1 1 1 0 1 0 1 0 ...
##  $ CRAFTVRS29           : num  0 1 0 0 0 1 0 0 0 0 ...
##  $ CRAFTVRS30           : num  0 1 1 0 0 1 0 0 0 0 ...
##  $ CRAFTVRS31           : num  0 0 0 0 0 1 0 0 1 0 ...
##  $ CRAFTVRS32           : num  0 1 1 0 1 1 0 1 1 1 ...
##  $ CRAFTVRS33           : num  0 1 1 0 1 0 0 1 0 1 ...
##  $ CRAFTVRS34           : num  0 1 1 0 1 0 0 0 0 1 ...
##  $ CRAFTVRS35           : num  0 0 1 0 1 1 0 0 0 1 ...
##  $ CRAFTVRS36           : num  0 0 1 0 0 0 0 0 0 0 ...
##  $ CRAFTVRS37           : num  0 0 1 0 0 0 0 0 0 0 ...
##  $ CRAFTVRS38           : num  0 0 1 0 0 1 0 0 0 0 ...
##  $ CRAFTVRS39           : num  0 1 0 0 0 1 0 0 0 1 ...
##  $ CRAFTVRS40           : num  0 1 0 0 0 0 0 0 0 1 ...
##  $ CRAFTVRS41           : num  0 1 1 0 1 1 0 0 1 1 ...
##  $ CRAFTVRS42           : num  0 1 1 0 1 0 0 0 0 1 ...
##  $ CRAFTVRS43           : num  0 1 1 0 1 1 0 0 1 0 ...
##  $ CRAFTVRS44           : num  0 1 1 1 1 0 1 0 1 0 ...
##  $ CRAFTURS1            : num  1 1 1 1 1 1 0 1 0 0 ...
##  $ CRAFTURS2            : num  1 1 1 1 1 1 0 1 1 1 ...
##  $ CRAFTURS3            : num  0 1 0 0 0 0 0 0 0 1 ...
##  $ CRAFTURS4            : num  1 1 1 1 1 1 0 0 1 1 ...
##  $ CRAFTURS5            : num  0 1 1 0 1 1 0 0 1 1 ...
##  $ CRAFTURS6            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ CRAFTURS7            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ CRAFTURS8            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ CRAFTURS9            : num  1 0 0 0 1 0 0 0 0 0 ...
##  $ CRAFTURS10           : num  0 1 0 0 0 0 0 0 0 0 ...
##  $ CRAFTURS11           : num  0 1 0 0 0 0 0 0 0 0 ...
##  $ CRAFTURS12           : num  0 1 1 0 0 0 0 0 1 1 ...
##  $ CRAFTURS13           : num  0 1 1 0 0 1 0 1 1 1 ...
##  $ CRAFTURS14           : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ CRAFTURS15           : num  0 1 1 0 0 0 0 1 1 0 ...
##  $ CRAFTURS16           : num  0 1 1 0 1 0 0 1 0 1 ...
##  $ CRAFTURS17           : num  1 1 1 1 1 1 1 0 1 0 ...
##  $ CRAFTURS18           : num  0 1 0 0 0 1 0 0 0 0 ...
##  $ CRAFTURS19           : num  0 1 1 0 0 1 0 1 0 1 ...
##  $ CRAFTURS20           : num  0 0 1 0 1 1 1 1 1 1 ...
##  $ CRAFTURS21           : num  0 0 1 1 1 1 0 0 1 1 ...
##  $ CRAFTURS22           : num  0 0 1 0 0 1 0 0 0 0 ...
##  $ CRAFTURS24           : num  0 1 1 0 1 0 0 0 1 1 ...
##  $ CRAFTURS23           : num  0 1 0 0 1 1 0 0 0 1 ...
##  $ CRAFTURS25           : num  0 1 1 1 1 1 0 0 1 0 ...
##  $ COMMENTS_CRAFTVRS    : chr  NA NA NA NA ...
##  $ SCORE_CRAFTVRS       : num  5 22 27 6 19 14 2 11 14 20 ...
##  $ SCORE_CRAFTVRS_STATUS: chr  NA NA NA NA ...
##  $ SCORE_CRAFTURS       : num  6 18 16 7 13 14 3 8 12 13 ...
##  $ SCORE_CRAFTURS_STATUS: chr  NA NA NA NA ...


Pull the regenerated DD

## load the CRAFT_21_IMM_RC sheet of the revised data dictionary (DD)
dfDD <- read_excel(path = revisedDDpath, sheet = "CRAFT_21_IMM_RC")


Handling Logical Variables

## extract all logical variables
logicols <- colnames(df)[vapply(df, is.logical, logical(1))]

## view those variables in the regenerated DD
dfDD[dfDD$VarNames %in% logicols,c("VarNames","Data Type")]
## # A tibble: 4 × 2
##   VarNames       `Data Type`  
##   <chr>          <chr>        
## 1 REFCTR         VARCHAR2(6)  
## 2 REVIEW_DATE    date         
## 3 REVIEWER       CHAR         
## 4 CRAFTVRS_ENTRY VARCHAR2(500)
## select the vars to be converted to numeric
## recomputed from this sub-dataset's DD; otherwise setdiff() below would use
## whatever convert2num was left over from the previously processed sub-dataset
convert2num <- dfDD$VarNames[dfDD$VarNames %in% logicols & grepl("number", dfDD$`Data Type`, ignore.case = TRUE)]
## none for this sub-dataset (see the table above)

## select the vars to be converted to date
convert2date <- dfDD$VarNames[dfDD$VarNames %in% logicols & grepl("date", dfDD$`Data Type`, ignore.case = TRUE)] ## REVIEW_DATE

## the rest should be converted to character
convert2chr <- setdiff(logicols, c(convert2num, convert2date)) ## "REFCTR"         "REVIEWER"       "CRAFTVRS_ENTRY"

## convert
if (length(convert2num) > 0) {
  df[convert2num] <- lapply(df[convert2num], as.numeric)
}
df[convert2date] <- lapply(df[convert2date], as.Date)
df[convert2chr] <- lapply(df[convert2chr], as.character)

## recheck the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
## 
## [[2]]
## [1] "character"
## 
## [[3]]
## [1] "POSIXct" "POSIXt" 
## 
## [[4]]
## [1] "Date"


Handling Date Variables

## extract date variables from sub-dataset
datecols <- colnames(df)[vapply(df, function(x) inherits(x, c("POSIXct", "POSIXt")), logical(1))]
## [1] "EXAM_DATE"     "DATE_OF_BIRTH" "CRAFTVRS_TIME" 

## extract date variables from regenerated DD
## normalise case and surrounding whitespace so "DATE", "date" and "Date" all match
datecolsFromDD <- dfDD$VarNames[toupper(trimws(dfDD$`Data Type`)) %in% "DATE"]

## compare the two to see if we are missing any date variables
setdiff(datecols,datecolsFromDD) ## character(0)
## character(0)
setdiff(datecolsFromDD,datecols) ## [1] "REVIEW_DATE" can ignore REVIEW_DATE, as it has been corrected in previous step
## [1] "REVIEW_DATE"
## drop = FALSE keeps a data frame even if only one date column is present
head(df[, datecols, drop = FALSE])
##    EXAM_DATE DATE_OF_BIRTH       CRAFTVRS_TIME
## 1 2024-02-13    1947-05-13 2024-02-13 10:41:00
## 2 2023-10-24    1954-10-29 2023-10-24 14:18:00
## 3 2023-05-15    1949-12-01 2023-05-15 10:12:00
## 4 2024-02-15    1942-09-30 2024-02-15 14:22:00
## 5 2023-09-13    1937-08-13 2023-09-13 10:02:00
## 6 2023-05-09    1936-05-22 2023-05-09 11:46:00
## convert format
## I will leave CRAFTVRS_TIME with format POSIXct since it contains the timestamp
## and I will convert the other two to date format
datecols <- setdiff(datecols, "CRAFTVRS_TIME")
df[datecols] <- lapply(df[datecols], as.Date)

## recheck the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
## 
## [[2]]
## [1] "character"
## 
## [[3]]
## [1] "Date"
## 
## [[4]]
## [1] "POSIXct" "POSIXt"


Handling Character Variables

## extract character variables from sub-dataset
chrcols <- colnames(df)[vapply(df, is.character, logical(1))] ## 12 vars

## check data type inconsistency:
## mismatchChrs_1: present as chr in data but others in the DD
## mismatchChrs_2: present as chr in DD but others in the data
## a single mask defines "character in the DD" for both the mismatch check and
## the valid-response check below, so the two cannot disagree
isChrInDD <- grepl("^(varchar|char)", dfDD$`Data Type`, ignore.case = TRUE)
chrColsfromDD <- dfDD[isChrInDD, c("VarNames","Data Type")]

mismatchChrs_1 <- setdiff(chrcols,chrColsfromDD$VarNames) ## character(0)
mismatchChrs_2 <- setdiff(chrColsfromDD$VarNames,chrcols) ## character(0)

## extract character variables with value specification
tmp <- dfDD[isChrInDD & !is.na(dfDD$`Valid Responses`), c("VarNames","Valid Responses")]

## check whether the unique values of the chr columns in the dataset match the DD
DT::datatable(check_valid_responses(tmp,df))
## ignore EXAMINER


Handling Numeric Variables

## names of the numeric columns in the sub-dataset
numcols <- names(Filter(is.numeric, df)) ## 82 vars

## DD rows declared as NUMBER, used to check data type inconsistency:
## mismatchNums_1: numeric in the data but declared otherwise in the DD
## mismatchNums_2: declared numeric in the DD but stored otherwise in the data
isNumInDD <- grepl("number", dfDD$`Data Type`, ignore.case = TRUE)
numColsfromDD <- dfDD[isNumInDD, c("VarNames","Valid Responses")]

mismatchNums_1 <- setdiff(numcols, numColsfromDD$VarNames) ## character(0)
mismatchNums_2 <- setdiff(numColsfromDD$VarNames, numcols) ## character(0)

unique(numColsfromDD$`Valid Responses`)
## [1] NA              "1 thru 99999;" "1 thru 9999;"  "0;\r\n1;"
## keep only the numeric DD entries that specify valid responses
tmp <- numColsfromDD[!is.na(numColsfromDD$`Valid Responses`), ]

DT::datatable(check_valid_numeric_responses(tmp, df))
## ignore GP


Save Cleaned Data

## overwrite CRAFT_21_IMM_RC with the type-corrected working copy `df`
CRAFT_21_IMM_RC <- df



MEDCON_RC

## work on a copy named `df`
df <- MEDCON_RC

info(MEDCON_RC,"SYSIND")
## #obs:627, cols:237, inds:618
## list the distinct column classes present in the sub-dataset
colClasses <- sapply(df, class)
unique(colClasses)
## [[1]]
## [1] "numeric"
## 
## [[2]]
## [1] "character"
## 
## [[3]]
## [1] "logical"
## 
## [[4]]
## [1] "POSIXct" "POSIXt"
## print the full structure; the large limits keep str() from truncating the column list
str(object = df, max.level = 99, list.len = 99999)
Click for details
## 'data.frame':    627 obs. of  237 variables:
##  $ SYSXM                     : num  8258763 8258803 8260083 8277583 8277993 ...
##  $ SYSIND                    : num  11037673 11369813 11362953 11435853 11621333 ...
##  $ SYSGP                     : num  7894423 7952013 7946353 7962813 8006293 ...
##  $ SYSGPSTUDY                : num  1309743 1397123 1387463 1407923 1453003 ...
##  $ SYSINDGP                  : num  7793413 8139083 8132223 8205123 8390403 ...
##  $ CGI_ORDER                 : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ GPS_ORDER                 : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ STDCGI_ORDER              : num  11 11 11 11 11 11 11 11 11 11 ...
##  $ LSTUDY                    : chr  "ADFAMPRADI" "ADCRLPRADI" "ADFAMPRADI" "ADCRLPRADI" ...
##  $ DB_OWNER                  : chr  "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" ...
##  $ STUDY                     : chr  "ALZ" "ALZ" "ALZ" "ALZ" ...
##  $ SUBSTUDY                  : chr  "ADFAMPRADI" "ADCRLPRADI" "ADFAMPRADI" "ADCRLPRADI" ...
##  $ CENTER                    : chr  "IHG" "IHG" "IHG" "IHG" ...
##  $ GP                        : num  87650 88301 87545 88452 104528 ...
##  $ IND                       : num  9000 1 106 1 1 ...
##  $ REFCTR                    : logi  NA NA NA NA NA NA ...
##  $ EXAM_DATE                 : POSIXct, format: "2023-10-24" "2024-02-13" ...
##  $ EXAMINER                  : chr  "gsv32" "jjs2031" "jjs2031" "gsv32" ...
##  $ DATE_OF_BIRTH             : POSIXct, format: "1954-10-29" "1947-05-13" ...
##  $ AGE_AT_EXAM               : num  68 76 66 81 86 86 60 81 79 67 ...
##  $ REVIEW_DATE               : logi  NA NA NA NA NA NA ...
##  $ REVIEWER                  : logi  NA NA NA NA NA NA ...
##  $ MEMORY_COMPLAINTS         : num  0 0 1 0 0 1 0 0 0 1 ...
##  $ DATE_OF_ONSET             : POSIXct, format: NA NA ...
##  $ DOA_UNK                   : chr  NA NA NA NA ...
##  $ DESCRIBE                  : chr  "hysterectomy (1987), no HET, knee surgery 2010, carpal tunnel surgery 2011" NA NA "Hypertension x 20yrs; left knee prothesis due to osteoarthrosis, right knee in need of surgery; hypercholest.; "| __truncated__ ...
##  $ MEM_COMPLAINTS            : chr  "68 y/o mixed female born in PR. Oriented in time, space and person. No memory complaints, however complaints of"| __truncated__ "NO MEMORY COMPLAINTS. PERSON ORIENTED IN TIME, SPACE, AND PERSON. HE LIVES ALONE, HE DOESN'T NEED HELP TO CHANG"| __truncated__ "YES MEMORY COMPLAINTS. ORIENTED EN TIME, SPACE, AND PERSONA. SHE SAYS THAT HER MEMORY WAS FULL AND WELL DURING "| __truncated__ "Refers no major changes in memory.  Remembers phone numbers and addresses well.  He is 81y/o, with 9yrs of educ"| __truncated__ ...
##  $ CURRENT_MED               : chr  "high blood pressure, sleep apnea, diabetes (10 years ago) arthritis (13 years ago)" "DM 10 Y/0 HIGH BLOOD PRESSURE 10 Y/0" "HYPOTHYROIDISM 30 Y/O CHOLESTEROL 10 Y/O DM 5 YEARS AGO BREAST CANCER 2013 DEPRESSION 2013 ASTHMA 4 Y/O ARTHIRITIS 2017." "see above" ...
##  $ PMH                       : chr  NA NA NA "see above" ...
##  $ MOOD_CHANGES              : chr  "None reported" "NO DEPRESSION OR ANXIETY" "YES DEPRESSION AND ANXEITY" "H/o depression and anxiety x 20yrs, with meds, was with psychiatrist but not anymore" ...
##  $ HYPERTENSION_DX           : num  1 1 0 1 1 1 0 0 1 1 ...
##  $ HYPERTENSION_TREATED      : num  1 1 -1 1 1 1 0 -1 1 1 ...
##  $ DIABETES_DX               : num  1 1 1 1 1 0 0 0 1 1 ...
##  $ DIABETES_TREATED          : num  1 1 1 1 1 0 0 -1 1 1 ...
##  $ MYOCARDIAL_DX             : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ MYOCARDIAL_TREATED        : num  0 -1 -1 NA -1 0 0 -1 NA -1 ...
##  $ HEART_FAILURE_DX          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ HEART_FAILURE_TREATED     : num  0 -1 -1 NA -1 0 0 -1 NA -1 ...
##  $ HEART_DISEASE_DX          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ HEART_DISEASE_TREATED     : num  0 -1 -1 NA -1 0 0 -1 NA -1 ...
##  $ COPD_DX                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ COPD_TREATED              : num  0 -1 -1 NA -1 0 0 -1 NA -1 ...
##  $ THYROID_DX                : num  0 0 1 0 0 0 0 0 0 0 ...
##  $ THYROID_TREATED           : num  0 -1 1 NA -1 0 0 -1 NA -1 ...
##  $ LIVER_DX                  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ LIVER_TREATED             : num  0 -1 -1 NA -1 0 0 -1 NA -1 ...
##  $ RENAL_DX                  : num  0 0 0 0 1 0 0 0 1 0 ...
##  $ RENAL_TREATED             : num  0 -1 -1 NA 1 0 0 -1 NA -1 ...
##  $ PEPTIC_DX                 : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ PEPTIC_TREATED            : num  0 -1 -1 NA -1 0 0 -1 NA -1 ...
##  $ PERIPHERAL_DX             : num  1 0 0 1 0 0 0 0 0 1 ...
##  $ PERIPHERAL_TREATED        : num  0 -1 -1 1 -1 0 0 -1 NA 0 ...
##  $ STROKE_DX                 : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ STROKE_TREATED            : num  0 -1 -1 NA -1 0 0 -1 NA -1 ...
##  $ TIA_DX                    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ TIA_TREATED               : num  0 -1 -1 NA -1 0 0 -1 NA -1 ...
##  $ HEAD_INJURY_DX            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ HEAD_INJURY_TREATED       : num  0 -1 -1 NA -1 0 0 -1 NA -1 ...
##  $ SEIZURE_DX                : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ SEIZURE_TREATED           : num  0 -1 -1 NA -1 0 0 -1 NA -1 ...
##  $ CANCER_DX                 : num  0 0 1 0 1 0 0 0 0 0 ...
##  $ CANCER_TREATED            : num  0 -1 0 NA 1 0 0 -1 NA -1 ...
##  $ ARTHRITIS_DX              : num  1 0 1 1 1 1 0 1 1 1 ...
##  $ ARTHRITIS_TREATED         : num  1 -1 1 1 1 1 0 1 1 0 ...
##  $ SYPHILIS_DX               : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ SYPHILIS_TREATED          : num  0 -1 -1 0 -1 0 0 -1 NA -1 ...
##  $ ALCOHOL_DX                : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ ALCOHOL_TREATED           : num  0 -1 -1 0 -1 0 0 -1 NA -1 ...
##  $ ILLICIT_DRUG_DX           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ ILLICIT_DRUG_TREATED      : num  0 -1 -1 0 -1 0 0 -1 NA -1 ...
##  $ SMOKING_DX                : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ SMOKING_TREATED           : num  0 -1 -1 0 -1 0 0 -1 NA -1 ...
##  $ PD_DX                     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ PD_TREATED                : num  0 -1 -1 0 -1 0 0 -1 NA -1 ...
##  $ HUNTINGTON_DX             : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ HUNTINGTON_TREATED        : num  0 -1 -1 NA -1 0 0 -1 NA -1 ...
##  $ MULTIPLE_SCLEROSIS_DX     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ MULTIPLE_SCLEROSIS_TREATED: num  0 -1 -1 NA -1 0 0 -1 NA -1 ...
##  $ B12_DX                    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ B12_TREATED               : num  0 -1 -1 NA -1 0 0 -1 NA -1 ...
##  $ HYDROCEPHALUS_DX          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ HYDROCEPHALUS_TREATED     : num  0 -1 -1 NA -1 0 0 -1 NA -1 ...
##  $ TREMOR_DX                 : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ TREMOR_TREATED            : num  0 -1 -1 NA -1 0 0 -1 NA -1 ...
##  $ DOWN_SYNDROME_DX          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ DOWN_SYNDROME_TREATED     : num  0 -1 -1 NA -1 0 0 -1 NA -1 ...
##  $ MED_CONDITIONS_DX         : num  0 0 0 1 0 0 0 0 0 0 ...
##  $ MED_CONDITIONS_TREATED    : num  0 -1 -1 1 -1 0 0 -1 NA -1 ...
##  $ OTH_MED_COND_SP           : chr  NA NA NA "depression and anxiety" ...
##  $ STROKE_BRAIN              : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ DOCTOR                    : num  NA 9 9 NA 9 0 NA 9 0 9 ...
##  $ STROKE_PAST               : num  NA 9 9 NA 9 0 NA 9 0 9 ...
##  $ STROKE_24HRS              : num  NA 9 9 NA 9 0 NA 9 0 9 ...
##  $ SYMPTOMS                  : num  NA 9 9 NA 9 0 NA 9 0 9 ...
##  $ LOST_SPEECH               : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ LOST_UNDERSTAND           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ LOSS_CONSCIOUS            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ WEAKNESS                  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ NUMBNESS                  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ LOSS_VISION               : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ HALF_VISION               : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ PERIOD                    : num  1 0 1 1 1 1 0 1 1 0 ...
##  $ AGE_C24A                  : num  67 NA 55 60 59 86 NA 77 70 NA ...
##  $ DONT_KNOW                 : chr  NA NA NA NA ...
##  $ SEEK_HELP                 : num  1 NA 1 1 0 1 NA 1 1 NA ...
##  $ TREATMENT                 : num  0 NA 0 0 0 0 NA 0 0 NA ...
##  $ MEDS                      : num  1 NA 1 1 0 0 NA 1 1 NA ...
##  $ PSYCHOTHERAPY             : num  1 NA 1 1 0 1 NA 1 1 NA ...
##  $ OTHER                     : num  0 NA 0 0 0 0 NA 0 0 NA ...
##  $ UNKNOWN                   : num  0 NA 0 0 0 0 NA 0 0 NA ...
##  $ SPECIFY_OTHER             : chr  NA NA NA NA ...
##  $ TAKING_MEDS               : num  1 0 1 1 1 1 1 1 1 1 ...
##  $ MEDICATION1               : chr  "aspirin" NA "LEVOTHYROXINE" "doesn't remember medications" ...
##  $ STRENGTH1                 : chr  "81 mg" NA "112 MG DAILY" NA ...
##  $ SEEN1                     : num  0 NA 0 0 0 NA 0 0 NA 0 ...
##  $ MEDICATION2               : chr  "lipidol" NA "MEMANTINE HCL" NA ...
##  $ STRENGTH2                 : chr  "20 mg" NA "10 MG TWICE DAILY" NA ...
##  $ SEEN2                     : chr  "0" NA "0" NA ...
##  $ MEDICATION3               : chr  "zyrtec" NA "XISDUO XR(METFORMIN HCL)" NA ...
##  $ STRENGTH3                 : chr  "10 mg" NA "5MG/1000MG" NA ...
##  $ SEEN3                     : num  0 NA 0 NA 0 NA NA 0 NA 0 ...
##  $ MEDICATION4               : chr  "vitamin D3" NA "ESCITALOPRAM" NA ...
##  $ STRENGTH4                 : chr  "50,000 d" NA "20 MG 1 DAILY" NA ...
##  $ SEEN4                     : chr  "0" NA "0" NA ...
##  $ MEDICATION5               : chr  "folic acid" NA "ATORVASTATIN CALCIUM" NA ...
##  $ STRENGTH5                 : chr  "1 mg" NA "20 MG" NA ...
##  $ SEEN5                     : num  0 NA 0 NA 0 NA NA 0 NA NA ...
##  $ MEDICATION6               : chr  "daflonex" NA "FOROTIDINE" NA ...
##  $ STRENGTH6                 : chr  "XL as indicated" NA "20 MG DAILY" NA ...
##  $ SEEN6                     : chr  "0" NA "0" NA ...
##  $ MEDICATION7               : chr  "methenamine" NA "LISINIPROL" NA ...
##  $ STRENGTH7                 : chr  "500 mg" NA "10 MG DAILY" NA ...
##  $ SEEN7                     : num  0 NA 0 NA 0 NA NA 0 NA NA ...
##  $ MEDICATION8               : chr  "methnotexate" NA "MONTELUKAST SODIUM" NA ...
##  $ STRENGTH8                 : chr  "2.5 mg" NA "10 MG DAILY" NA ...
##  $ SEEN8                     : chr  "0" NA "0" NA ...
##  $ MEDICATION9               : chr  "lexapro" NA "FOLIC ACID" NA ...
##  $ STRENGTH9                 : chr  "10 mg" NA "1 MG DAILY" NA ...
##  $ SEEN9                     : num  0 NA 0 NA 0 NA NA NA NA NA ...
##  $ MEDICATION10              : chr  "frova" NA "VITAMIN D" NA ...
##  $ STRENGTH10                : chr  "2.5 mg" NA NA NA ...
##  $ SEEN10                    : chr  "0" NA "0" NA ...
##  $ MEDICATION11              : chr  "mirapen" NA "BIOTIN" NA ...
##  $ STRENGTH11                : chr  ".5 mg" NA NA NA ...
##  $ SEEN11                    : num  0 NA 0 NA NA NA NA NA NA NA ...
##  $ MEDICATION12              : chr  "lyrica" NA NA NA ...
##  $ STRENGTH12                : chr  "25 mg" NA NA NA ...
##  $ SEEN12                    : chr  NA NA NA NA ...
##  $ MEDICATION13              : chr  "lisinopril" NA NA NA ...
##  $ STRENGTH13                : chr  "5 mg" NA NA NA ...
##  $ SEEN13                    : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ MEDICATION14              : chr  "pepcid" NA NA NA ...
##  $ STRENGTH14                : chr  "20 mg" NA NA NA ...
##  $ SEEN14                    : chr  NA NA NA NA ...
##  $ MEDICATION15              : chr  NA NA NA NA ...
##  $ STRENGTH15                : chr  NA NA NA NA ...
##  $ SEEN15                    : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ MEDICATION16              : chr  NA NA NA NA ...
##  $ STRENGTH16                : chr  NA NA NA NA ...
##  $ SEEN16                    : chr  NA NA NA NA ...
##  $ MEDICATION17              : chr  NA NA NA NA ...
##  $ STRENGTH17                : chr  NA NA NA NA ...
##  $ SEEN17                    : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ MEDICATION18              : chr  NA NA NA NA ...
##  $ STRENGTH18                : logi  NA NA NA NA NA NA ...
##  $ SEEN18                    : chr  NA NA NA NA ...
##  $ MEDICATION19              : chr  NA NA NA NA ...
##  $ STRENGTH19                : chr  NA NA NA NA ...
##  $ SEEN19                    : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ MEDICATION20              : logi  NA NA NA NA NA NA ...
##  $ STRENGTH20                : logi  NA NA NA NA NA NA ...
##  $ SEEN20                    : logi  NA NA NA NA NA NA ...
##  $ NOTES_MEDLIST             : chr  NA NA NA NA ...
##  $ WARFARIN                  : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ ASPIRIN                   : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ ANTIPLATELETS             : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ DIURETICS                 : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ ANTICONVULSANTS           : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ INSULIN                   : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ HYPOGLYCEMICS             : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ SULFONYLUREA              : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ METFORMIN                 : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ GLITAZONES                : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ DIGITALIS                 : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ NITRATES                  : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ CALCIUM_CHANNEL           : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ BETA_2_AGAONIST           : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ BETA_BLOCKERS             : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ ACE                       : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ ANTI_ARRHYTHMICS          : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ ANTI_HYPERLIPIDEMICS      : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ STATIN_DRUG               : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ FIBRATE_DRUG              : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ THYROID                   : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ ANTICHOLINERGICS          : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ LEVODOPA                  : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ DOPAMINE1                 : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ ANTIDEPRESSANTS           : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ ANTIPSYCHOTICS            : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ ANXIOLYTICS               : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ CHOLINESTERASE            : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ RIVASTIGMINE              : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ TACRINE                   : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ DONEPEZIL                 : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ GALANTAMINE               : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ NMDA                      : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ MEMANTINE                 : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ ALPHA_BLOCKERS            : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ HYPNOTICS                 : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ H1_BLOCKERS               : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ H2_BLOCKERS               : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ NSAID                     : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ COX2                      : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ NARCOTICS                 : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ HYDERGINE                 : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ DEPRENYL                  : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ ESTROGEN_SUPP             : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ PRESCRIPTION              : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ OTC                       : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ STEROIDS                  : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ OTHER_MEDS                : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ C57_SPEC_MEDS             : chr  NA NA NA NA ...
##  $ MULTIVITAMINS             : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ VITAMIN_C                 : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ VITAMIN_E                 : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ VITAMINE_B12              : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ COENZYME_Q                : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ DHA                       : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ LECITHIN                  : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ GINKGO                    : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ FOLIC_ACID                : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ VITAMIN_B6                : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ VITAMIN_D                 : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ OMEGA3                    : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ MEDCOND_COMENTS           : chr  NA NA NA NA ...
##  $ MED_CONDITIONS_HIV        : num  0 0 0 0 0 0 0 0 NA 0 ...
##  $ MED_CONDITIONS_HIV_TX     : num  0 -1 -1 NA -1 0 0 -1 NA -1 ...


Pull the regenerated DD

## load the MEDCON_RC sheet of the regenerated data dictionary (DD)
dfDD <- readxl::read_excel(path = revisedDDpath, sheet = "MEDCON_RC")


Handling Logical Variables

## extract all logical variables
## vapply() always yields a logical vector (even for a zero-column data frame),
## whereas sapply() would return an empty list that cannot be used as an index
logicols <- colnames(df)[vapply(df, is.logical, logical(1))] ## 7 vars

## view those variables in the regenerated DD
dfDD[dfDD$VarNames %in% logicols, c("VarNames", "Data Type")]
## # A tibble: 7 × 2
##   VarNames     `Data Type` 
##   <chr>        <chr>       
## 1 REFCTR       VARCHAR2(6) 
## 2 REVIEW_DATE  DATE        
## 3 REVIEWER     CHAR        
## 4 STRENGTH18   VARCHAR2(30)
## 5 MEDICATION20 VARCHAR2(30)
## 6 STRENGTH20   VARCHAR2(30)
## 7 SEEN20       NUMBER(1)
## group the logical columns by the type the DD declares for them
## (character, date or numeric) so that each group can be cast accordingly
isLogiVar <- dfDD$VarNames %in% logicols
convert2chr <- dfDD$VarNames[isLogiVar & grepl("CHAR", dfDD$`Data Type`, ignore.case = TRUE)]
convert2date <- dfDD$VarNames[isLogiVar & grepl("date", dfDD$`Data Type`, ignore.case = TRUE)]
convert2num <- dfDD$VarNames[isLogiVar & grepl("NUMBER", dfDD$`Data Type`, ignore.case = TRUE)]

## convert each group to the class declared in the DD
df[convert2chr] <- lapply(df[convert2chr], as.character)
df[convert2date] <- lapply(df[convert2date], as.Date)
df[convert2num] <- lapply(df[convert2num], as.numeric)

## recheck the unique data types ("logical" should no longer be listed)
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
## 
## [[2]]
## [1] "character"
## 
## [[3]]
## [1] "POSIXct" "POSIXt" 
## 
## [[4]]
## [1] "Date"


Handling Date Variables

## extract date variables from sub-dataset
## (vapply() guarantees a logical index even when df has no columns)
datecols <- colnames(df)[vapply(df, function(x) inherits(x, c("POSIXct", "POSIXt")), logical(1))]
## [1] "EXAM_DATE"     "DATE_OF_BIRTH" "DATE_OF_ONSET"

## extract date variables from regenerated DD
## (exact match on DATE, in any letter case, so "Date" is not missed)
datecolsFromDD <- dfDD$VarNames[grepl("^date$", dfDD$`Data Type`, ignore.case = TRUE)]

## compare the two to see if we are missing any date variables
setdiff(datecols, datecolsFromDD) ## character(0)
## character(0)
setdiff(datecolsFromDD, datecols) ## REVIEW_DATE, ignore it, it has been corrected in previous step
## [1] "REVIEW_DATE"
## drop = FALSE keeps a data frame even if only one date column is present
head(df[, datecols, drop = FALSE])
##    EXAM_DATE DATE_OF_BIRTH DATE_OF_ONSET
## 1 2023-10-24    1954-10-29          <NA>
## 2 2024-02-13    1947-05-13          <NA>
## 3 2024-02-20    1957-08-05    2021-06-01
## 4 2024-02-15    1942-09-30          <NA>
## 5 2023-05-09    1936-05-22          <NA>
## 6 2023-09-13    1937-08-13    2023-04-01
## cast every date-time column found above to class Date
df[datecols] <- lapply(datecols, function(nm) as.Date(df[[nm]]))

## confirm which classes remain after the cast
unique(sapply(X = df, FUN = class))
## [1] "numeric"   "character" "Date"


Handling Character Variables

## extract character variables from sub-dataset
## (vapply() guarantees a logical index even when df has no columns)
chrcols <- colnames(df)[vapply(df, is.character, logical(1))] ## 69 vars

## check data type inconsistency:
## mismatchChrs_1: present as chr in data but others in the DD
## mismatchChrs_2: present as chr in DD but others in the data
chrColsfromDD <- dfDD[grepl("^(varchar|char)", dfDD$`Data Type`, ignore.case = TRUE), c("VarNames", "Data Type")]

mismatchChrs_1 <- setdiff(chrcols, chrColsfromDD$VarNames) ## 9 vars
## [1] "SEEN2"  "SEEN4"  "SEEN6"  "SEEN8"  "SEEN10" "SEEN12" "SEEN14" "SEEN16" "SEEN18"
mismatchChrs_2 <- setdiff(chrColsfromDD$VarNames, chrcols) ## character(0)

## list any non-missing text values that as.numeric() cannot parse: they would
## become NA in the conversion below, so review them before relying on it
nonNumericValues <- lapply(df[mismatchChrs_1], function(x) {
  unique(x[!is.na(x) & is.na(suppressWarnings(as.numeric(x)))])
})
nonNumericValues[lengths(nonNumericValues) > 0]

## convert mismatchChrs_1 vars to numeric
df[mismatchChrs_1] <- lapply(df[mismatchChrs_1], as.numeric)

## extract character variables with value specification
## ("CHAR" alone also matches VARCHAR, so the alternation was redundant)
tmp <- dfDD[grepl("CHAR", dfDD$`Data Type`, ignore.case = TRUE) & !is.na(dfDD$`Valid Responses`), c("VarNames", "Valid Responses")]

## check whether the unique values of the chr columns in the dataset match the DD
DT::datatable(check_valid_responses(tmp, df))
## All values are within valid ranges.
## ignore EXAMINER


Handling Numeric Variables

## extract numeric variables from sub-dataset
## (vapply() guarantees a logical index even when df has no columns)
numcols <- colnames(df)[vapply(df, is.numeric, logical(1))] ## 164 vars

## extract numeric variables from DD and check data type inconsistency:
## mismatchNums_1: present as numeric in data but others in the DD
## mismatchNums_2: present as numeric in DD but others in the data
numColsfromDD <- dfDD[grepl("number", dfDD$`Data Type`, ignore.case = TRUE), c("VarNames", "Valid Responses")]

mismatchNums_1 <- setdiff(numcols, numColsfromDD$VarNames) ## character(0)
mismatchNums_2 <- setdiff(numColsfromDD$VarNames, numcols) ## character(0)

unique(numColsfromDD$`Valid Responses`)
## [1] NA                           "1 thru 99999;"             
## [3] "1 thru 9999;"               "0;\r\n1;"                  
## [5] "0;\r\n1;\r\n9;\r\n-1;"      "0;\r\n1;\r\n7;\r\n8;\r\n9;"
## [7] "0;\r\n1;\r\n9;"
## keep only the numeric DD entries that specify valid responses
## (reuses numColsfromDD instead of repeating the same DD filter)
tmp <- numColsfromDD[!is.na(numColsfromDD$`Valid Responses`), ]

DT::datatable(check_valid_numeric_responses(tmp, df))
## ignore GP


Save Cleaned Data

## store the type-corrected working copy back under the dataset's own name;
## this is an in-memory reassignment only, nothing is written to disk here
MEDCON_RC <- df



MEDICAL_HIST

## work on a copy named df; the cleaned result is assigned back to
## MEDICAL_HIST at the end of this section
df <- MEDICAL_HIST

## info() is a helper defined outside this section (presumably in the sourced
## helper script -- confirm); the line printed below reports #obs, cols and inds
info(MEDICAL_HIST,"SYSIND")
## #obs:889, cols:53, inds:871
## extract all the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
## 
## [[2]]
## [1] "character"
## 
## [[3]]
## [1] "logical"
## 
## [[4]]
## [1] "POSIXct" "POSIXt"
## list.len is set far above the column count so that str() lists every
## column instead of truncating the listing
str(df, max.level = 99, list.len = 99999)
Click for details
## 'data.frame':    889 obs. of  53 variables:
##  $ SYSXM          : num  7606563 7493573 7592623 7576033 7596083 ...
##  $ SYSIND         : num  11163223 11037553 11160533 11158043 11007943 ...
##  $ SYSGP          : num  7924813 7894373 7896973 7896073 7888893 ...
##  $ SYSGPSTUDY     : num  1362923 1309693 1312293 1311393 1304233 ...
##  $ SYSINDGP       : num  7926433 7793293 7923643 7921153 7762743 ...
##  $ CGI_ORDER      : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ GPS_ORDER      : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ STDCGI_ORDER   : num  11 11 11 11 11 11 11 11 11 11 ...
##  $ LSTUDY         : chr  "ADCRLPRADI" "ADFAMPRADI" "ADFAMPRADI" "ADFAMPRADI" ...
##  $ DB_OWNER       : chr  "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" ...
##  $ STUDY          : chr  "ALZ" "ALZ" "ALZ" "ALZ" ...
##  $ SUBSTUDY       : chr  "ADCRLPRADI" "ADFAMPRADI" "ADFAMPRADI" "ADFAMPRADI" ...
##  $ CENTER         : chr  "IHG" "IHG" "IHG" "IHG" ...
##  $ GP             : num  87927 87502 87684 87511 87564 ...
##  $ IND            : num  1 1 103 9006 100 ...
##  $ REFCTR         : logi  NA NA NA NA NA NA ...
##  $ EXAM_DATE      : POSIXct, format: "2018-10-17" "2018-01-08" ...
##  $ EXAMINER       : chr  "v.rodriguez4" "axr1589" "axr1589" "axr1589" ...
##  $ DATE_OF_BIRTH  : POSIXct, format: "1933-05-18" "1952-07-06" ...
##  $ AGE_AT_EXAM    : num  85 65 59 60 68 72 77 72 67 88 ...
##  $ XMSTUDY        : chr  "ALZ" "ALZ" "ALZ" "ALZ" ...
##  $ RELATION       : chr  "Parent" "Spouse" "Spouse" "Spouse" ...
##  $ ANXIETY        : chr  "Y" "N" "Y" "Y" ...
##  $ ASTHMA         : chr  "N" "N" "N" "N" ...
##  $ A_D_D          : chr  "N" "U" "N" "N" ...
##  $ AUTISM         : chr  "N" "N" "N" "N" ...
##  $ CANCER         : chr  "N" "N" "N" "N" ...
##  $ CANCER_TYPE    : chr  NA NA NA NA ...
##  $ DEPRESSION     : chr  "Y" "N" "N" "Y" ...
##  $ DIABETES_TYPE1 : chr  "N" "N" "N" "N" ...
##  $ DIABETES_TYPE2 : chr  "N" "N" "Y" "N" ...
##  $ DIABETES       : chr  "N" "N" "Y" "N" ...
##  $ LIPIDS_CHOL    : chr  "Y" "N" "N" "Y" ...
##  $ EPILEPSY       : chr  "N" "N" "N" "N" ...
##  $ GASTRIC_ULCERS : chr  "N" "N" "N" "N" ...
##  $ HEART_DISEASE  : chr  "N" "N" "N" "N" ...
##  $ HYPERTENSION   : chr  "Y" "N" "Y" "N" ...
##  $ KIDNEY_DISEASE : chr  "N" "N" "N" "N" ...
##  $ LIVER_DISEASE  : chr  "N" "N" "N" "N" ...
##  $ DEMENTIA       : chr  "Y" "Y" "Y" "Y" ...
##  $ MIGRAINES      : chr  "N" "N" "N" "U" ...
##  $ M_SCLEROSIS    : chr  "N" "N" "N" "N" ...
##  $ OBS_COMPULSIVE : chr  "N" "N" "N" "Y" ...
##  $ OSTEOARTHRITIS : chr  "N" "N" "N" "N" ...
##  $ OSTEOPOROSIS   : chr  "Y" "N" "N" "N" ...
##  $ PD             : chr  "N" "N" "N" "N" ...
##  $ ARTHRITIS      : chr  "N" "N" "N" "N" ...
##  $ RHINITIS       : chr  "N" "N" "N" "N" ...
##  $ SPINA_BIFIDA   : chr  "N" "N" "N" "N" ...
##  $ STROKE         : chr  "N" "N" "N" "N" ...
##  $ THYROID_DISEASE: chr  "Y" "N" "N" "Y" ...
##  $ CIGARETTES     : logi  NA NA NA NA NA NA ...
##  $ CURR_MEDS      : logi  NA NA NA NA NA NA ...


Pull the regenerated DD

## load the MEDICAL_HIST sheet of the regenerated data dictionary (DD)
dfDD <- readxl::read_excel(path = revisedDDpath, sheet = "MEDICAL_HIST")


Handling Logical Variables

## extract all logical variables
## vapply() always yields a logical vector (even for a zero-column data frame),
## whereas sapply() would return an empty list that cannot be used as an index
logicols <- colnames(df)[vapply(df, is.logical, logical(1))]

## view those variables in the regenerated DD
dfDD[dfDD$VarNames %in% logicols, c("VarNames", "Data Type")]
## # A tibble: 3 × 2
##   VarNames   `Data Type`
##   <chr>      <chr>      
## 1 REFCTR     VARCHAR2(6)
## 2 CIGARETTES CHAR       
## 3 CURR_MEDS  CHAR
## group the logical columns by the type the DD declares for them.
## The match is case-insensitive and covers character, date and numeric types,
## mirroring the MEDCON_RC section, so that a logical column declared as DATE
## or NUMBER in the DD is not silently left as logical
isLogiVar <- dfDD$VarNames %in% logicols
convert2chr <- dfDD$VarNames[isLogiVar & grepl("CHAR", dfDD$`Data Type`, ignore.case = TRUE)]
convert2date <- dfDD$VarNames[isLogiVar & grepl("date", dfDD$`Data Type`, ignore.case = TRUE)]
convert2num <- dfDD$VarNames[isLogiVar & grepl("NUMBER", dfDD$`Data Type`, ignore.case = TRUE)]

## convert each non-empty group to the class declared in the DD
if (length(convert2chr) > 0) {
  df[convert2chr] <- lapply(df[convert2chr], as.character)
}
if (length(convert2date) > 0) {
  df[convert2date] <- lapply(df[convert2date], as.Date)
}
if (length(convert2num) > 0) {
  df[convert2num] <- lapply(df[convert2num], as.numeric)
}

## recheck the unique data types ("logical" should no longer be listed)
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
## 
## [[2]]
## [1] "character"
## 
## [[3]]
## [1] "POSIXct" "POSIXt"


Handling Date Variables

## extract date variables from sub-dataset
## (vapply() guarantees a logical index even when df has no columns)
datecols <- colnames(df)[vapply(df, function(x) inherits(x, c("POSIXct", "POSIXt")), logical(1))]
## [1] "EXAM_DATE"     "DATE_OF_BIRTH"

## extract date variables from regenerated DD
## (exact match on DATE, in any letter case, so "Date" is not missed)
datecolsFromDD <- dfDD$VarNames[grepl("^date$", dfDD$`Data Type`, ignore.case = TRUE)]

## compare the two to see if we are missing any date variables
setdiff(datecols, datecolsFromDD) ## character(0)
## character(0)
setdiff(datecolsFromDD, datecols) ## character(0)
## character(0)
## drop = FALSE keeps a data frame even if only one date column is present
head(df[, datecols, drop = FALSE])
##    EXAM_DATE DATE_OF_BIRTH
## 1 2018-10-17    1933-05-18
## 2 2018-01-08    1952-07-06
## 3 2018-08-21    1958-10-31
## 4 2018-06-08    1957-10-06
## 5 2018-06-07    1949-07-20
## 6 2018-06-28    1946-01-30
## cast every date-time column found above to class Date
df[datecols] <- lapply(datecols, function(nm) as.Date(df[[nm]]))

## confirm which classes remain after the cast
unique(sapply(X = df, FUN = class))
## [1] "numeric"   "character" "Date"


Handling Character Variables

## extract character variables from sub-dataset
## (vapply() guarantees a logical index even when df has no columns)
chrcols <- colnames(df)[vapply(df, is.character, logical(1))] ## 40 vars

## check data type inconsistency:
## mismatchChrs_1: present as chr in data but others in the DD
## mismatchChrs_2: present as chr in DD but others in the data
chrColsfromDD <- dfDD[grepl("^(varchar|char)", dfDD$`Data Type`, ignore.case = TRUE), c("VarNames", "Data Type")]

mismatchChrs_1 <- setdiff(chrcols, chrColsfromDD$VarNames) ## character(0)
mismatchChrs_2 <- setdiff(chrColsfromDD$VarNames, chrcols) ## character(0)

## extract character variables with value specification
## ("CHAR" alone also matches VARCHAR, so the alternation was redundant)
tmp <- dfDD[grepl("CHAR", dfDD$`Data Type`, ignore.case = TRUE) & !is.na(dfDD$`Valid Responses`), c("VarNames", "Valid Responses")]

## check whether the unique values of the chr columns in the dataset match the DD
DT::datatable(check_valid_responses(tmp, df))
## ignore CANCER_TYPE, as it is a multiple values variable


Handling Numeric Variables

## extract numeric variables from sub-dataset
## (vapply() guarantees a logical index even when df has no columns)
numcols <- colnames(df)[vapply(df, is.numeric, logical(1))] ## 11 vars

## extract numeric variables from DD and check data type inconsistency:
## mismatchNums_1: present as numeric in data but others in the DD
## mismatchNums_2: present as numeric in DD but others in the data
numColsfromDD <- dfDD[grepl("number", dfDD$`Data Type`, ignore.case = TRUE), c("VarNames", "Valid Responses")]

mismatchNums_1 <- setdiff(numcols, numColsfromDD$VarNames) ## character(0)
mismatchNums_2 <- setdiff(numColsfromDD$VarNames, numcols) ## character(0)

unique(numColsfromDD$`Valid Responses`)
## [1] NA              "1 thru 99999;" "1 thru 9999;"
## keep only the numeric DD entries that specify valid responses
## (reuses numColsfromDD instead of repeating the same DD filter)
tmp <- numColsfromDD[!is.na(numColsfromDD$`Valid Responses`), ]

DT::datatable(check_valid_numeric_responses(tmp, df))
## All numeric values are within valid ranges.
## ignore GP


Save Cleaned Data

## store the type-corrected working copy back under the dataset's own name;
## this is an in-memory reassignment only, nothing is written to disk here
MEDICAL_HIST <- df



MINT_RC

## copy the raw MINT_RC data into df, the working object used by the checks below
df <- MINT_RC

## info() is a helper defined outside this section (presumably in the sourced
## helper script -- confirm); the line printed below reports #obs, cols and inds
info(MINT_RC,"SYSIND")
## #obs:3, cols:221, inds:3
## extract all the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
## 
## [[2]]
## [1] "character"
## 
## [[3]]
## [1] "logical"
## 
## [[4]]
## [1] "POSIXct" "POSIXt"
## list.len is set far above the column count so that str() lists every
## column instead of truncating the listing
str(df, max.level = 99, list.len = 99999)
Click for details
## 'data.frame':    3 obs. of  221 variables:
##  $ SYSXM                    : num  8247903 8300263 8342313
##  $ SYSIND                   : num  11660243 11676853 11667133
##  $ SYSGP                    : num  8011553 8017323 7946313
##  $ SYSGPSTUDY               : num  1458263 1464033 1387423
##  $ SYSINDGP                 : num  8429313 8445923 8436203
##  $ CGI_ORDER                : num  1 1 1
##  $ GPS_ORDER                : num  1 1 1
##  $ STDCGI_ORDER             : num  11 11 11
##  $ LSTUDY                   : chr  "HAFS" "HAFS" "ADCONTROL"
##  $ DB_OWNER                 : chr  "CLINIC_USER" "CLINIC_USER" "CLINIC_USER"
##  $ STUDY                    : chr  "ALZ" "ALZ" "ALZ"
##  $ SUBSTUDY                 : chr  "HAFS" "HAFS" "ADCONTROL"
##  $ CENTER                   : chr  "IHG" "IHG" "IHG"
##  $ GP                       : num  105805 105811 88254
##  $ IND                      : num  1 1 9005
##  $ REFCTR                   : logi  NA NA NA
##  $ EXAM_DATE                : POSIXct, format: "2023-03-17" "2024-03-18" ...
##  $ EXAMINER                 : chr  "gsv32" "mxp1257" "gsv32"
##  $ DATE_OF_BIRTH            : POSIXct, format: "1955-09-07" "1950-03-22" ...
##  $ AGE_AT_EXAM              : num  67 73 62
##  $ REVIEW_DATE              : logi  NA NA NA
##  $ REVIEWER                 : logi  NA NA NA
##  $ MINT1A                   : logi  NA NA NA
##  $ MINT1B                   : num  1 1 1
##  $ MINT1C                   : logi  NA NA NA
##  $ MINT1D                   : num  1 1 1
##  $ MINT1F                   : logi  NA NA NA
##  $ BUTTERFLY_OTHER          : logi  NA NA NA
##  $ MINT2A                   : logi  NA NA NA
##  $ MINT2B                   : num  1 1 1
##  $ MINT2C                   : logi  NA NA NA
##  $ MINT2D                   : num  1 1 1
##  $ MINT2F                   : logi  NA NA NA
##  $ GLOVE_OTHER              : logi  NA NA NA
##  $ MINT3A                   : logi  NA NA NA
##  $ MINT3B                   : num  1 1 1
##  $ MINT3C                   : logi  NA NA NA
##  $ MINT3D                   : num  1 1 1
##  $ MINT3F                   : logi  NA NA NA
##  $ LIGHTBULB_OTHER          : logi  NA NA NA
##  $ MINT4A                   : logi  NA NA NA
##  $ MINT4B                   : num  1 1 1
##  $ MINT4C                   : logi  NA NA NA
##  $ MINT4D                   : num  1 1 1
##  $ MINT4F                   : logi  NA NA NA
##  $ WATCH_OTHER              : logi  NA NA NA
##  $ MINT5A                   : logi  NA NA NA
##  $ MINT5B                   : num  1 1 1
##  $ MINT5C                   : logi  NA NA NA
##  $ MINT5D                   : num  1 1 1
##  $ MINT5F                   : logi  NA NA NA
##  $ CANDLE_OTHER             : logi  NA NA NA
##  $ MINT6A                   : logi  NA NA NA
##  $ MINT6B                   : num  1 1 1
##  $ MINT6C                   : logi  NA NA NA
##  $ MINT6D                   : num  1 1 1
##  $ MINT6F                   : logi  NA NA NA
##  $ CLOWN_OTHER              : logi  NA NA NA
##  $ MINT7A                   : logi  NA NA NA
##  $ MINT7B                   : num  1 1 1
##  $ MINT7C                   : logi  NA NA NA
##  $ MINT7D                   : num  1 1 1
##  $ MINT7F                   : logi  NA NA NA
##  $ KITE_OTHER               : logi  NA NA NA
##  $ MINT8A                   : logi  NA NA NA
##  $ MINT8B                   : num  1 1 1
##  $ MINT8C                   : logi  NA NA NA
##  $ MINT8D                   : num  1 1 1
##  $ MINT8F                   : logi  NA NA NA
##  $ RAINBOW_OTHER            : logi  NA NA NA
##  $ MINT9A                   : logi  NA NA NA
##  $ MINT9B                   : num  1 1 1
##  $ MINT9C                   : logi  NA NA NA
##  $ MINT9D                   : num  1 1 1
##  $ MINT9F                   : logi  NA NA NA
##  $ WITCH_OTHER              : logi  NA NA NA
##  $ MINT10A                  : logi  NA NA NA
##  $ MINT10B                  : num  1 1 1
##  $ MINT10C                  : logi  NA NA NA
##  $ MINT10D                  : num  1 1 1
##  $ MINT10F                  : logi  NA NA NA
##  $ SEESAW_OTHER             : logi  NA NA NA
##  $ MINT11A                  : logi  NA NA NA
##  $ MINT11B                  : num  1 1 1
##  $ MINT11C                  : logi  NA NA NA
##  $ MINT11D                  : num  1 1 1
##  $ MINT11F                  : logi  NA NA NA
##  $ FLASHLIGHT_OTHER         : logi  NA NA NA
##  $ MINT12A                  : logi  NA NA NA
##  $ MINT12B                  : num  1 1 1
##  $ MINT12C                  : logi  NA NA NA
##  $ MINT12D                  : num  1 1 1
##  $ MINT12F                  : logi  NA NA NA
##  $ PEACOCK_OTHER            : logi  NA NA NA
##  $ MINT13A                  : logi  NA NA NA
##  $ MINT13B                  : num  1 1 1
##  $ MINT13C                  : logi  NA NA NA
##  $ MINT13D                  : num  1 1 1
##  $ MINT13F                  : logi  NA NA NA
##  $ SNAIL_OTHER              : logi  NA NA NA
##  $ MINT14A                  : logi  NA NA NA
##  $ MINT14B                  : num  1 1 1
##  $ MINT14C                  : logi  NA NA NA
##  $ MINT14D                  : num  1 1 1
##  $ MINT14F                  : logi  NA NA NA
##  $ WHALE_OTHER              : logi  NA NA NA
##  $ MINT15A                  : logi  NA NA NA
##  $ MINT15B                  : num  1 1 1
##  $ MINT15C                  : logi  NA NA NA
##  $ MINT15D                  : num  1 1 1
##  $ MINT15F                  : logi  NA NA NA
##  $ CAGE_OTHER               : logi  NA NA NA
##  $ MINT16A                  : logi  NA NA NA
##  $ MINT16B                  : num  1 1 1
##  $ MINT16C                  : logi  NA NA NA
##  $ MINT16D                  : num  1 1 1
##  $ MINT16F                  : logi  NA NA NA
##  $ NEST_OTHER               : logi  NA NA NA
##  $ MINT17A                  : logi  NA NA NA
##  $ MINT17B                  : num  1 1 1
##  $ MINT17C                  : logi  NA NA NA
##  $ MINT17D                  : num  1 1 1
##  $ MINT17F                  : logi  NA NA NA
##  $ PLUG_OTHER               : logi  NA NA NA
##  $ MINT18A                  : logi  NA NA NA
##  $ MINT18B                  : num  1 1 1
##  $ MINT18C                  : logi  NA NA NA
##  $ MINT18D                  : num  1 1 1
##  $ MINT18F                  : logi  NA NA NA
##  $ WIG_OTHER                : logi  NA NA NA
##  $ MINT19A                  : logi  NA NA NA
##  $ MINT19B                  : num  1 1 1
##  $ MINT19C                  : logi  NA NA NA
##  $ MINT19D                  : num  1 1 1
##  $ MINT19F                  : logi  NA NA NA
##  $ SCREW_OTHER              : logi  NA NA NA
##  $ MINT20A                  : logi  NA NA NA
##  $ MINT20B                  : num  1 1 1
##  $ MINT20C                  : logi  NA NA NA
##  $ MINT20D                  : num  1 1 1
##  $ MINT20F                  : logi  NA NA NA
##  $ SCARF_OTHER              : logi  NA NA NA
##  $ MINT21A                  : logi  NA NA NA
##  $ MINT21B                  : num  1 1 1
##  $ MINT21C                  : logi  NA NA NA
##  $ MINT21D                  : num  1 1 1
##  $ MINT21F                  : logi  NA NA NA
##  $ WELL_OTHER               : logi  NA NA NA
##  $ MINT22A                  : logi  NA NA NA
##  $ MINT22B                  : num  1 1 1
##  $ MINT22C                  : logi  NA NA NA
##  $ MINT22D                  : num  1 1 1
##  $ MINT22F                  : logi  NA NA NA
##  $ DUSTPAN_OTHER            : logi  NA NA NA
##  $ MINT23A                  : logi  NA NA NA
##  $ MINT23B                  : num  1 1 1
##  $ MINT23C                  : logi  NA NA NA
##  $ MINT23D                  : num  1 1 1
##  $ MINT23F                  : logi  NA NA NA
##  $ PARACHUTE_OTHER          : logi  NA NA NA
##  $ MINT24A                  : num  NA 1 NA
##  $ MINT24B                  : num  1 NA 1
##  $ MINT24C                  : num  NA 1 NA
##  $ MINT24D                  : num  1 1 1
##  $ MINT24F                  : logi  NA NA NA
##  $ BLIND_OTHER              : chr  NA "BALLENA" NA
##  $ MINT25A                  : logi  NA NA NA
##  $ MINT25B                  : num  1 1 1
##  $ MINT25C                  : logi  NA NA NA
##  $ MINT25D                  : num  1 1 1
##  $ MINT25F                  : logi  NA NA NA
##  $ HINGE_OTHER              : logi  NA NA NA
##  $ MINT26A                  : logi  NA NA NA
##  $ MINT26B                  : num  1 1 1
##  $ MINT26C                  : logi  NA NA NA
##  $ MINT26D                  : num  1 1 1
##  $ MINT26F                  : logi  NA NA NA
##  $ FUNNEL_OTHER             : logi  NA NA NA
##  $ MINT27A                  : num  NA 1 NA
##  $ MINT27B                  : num  1 NA 1
##  $ MINT27C                  : num  NA 1 NA
##  $ MINT27D                  : num  1 1 1
##  $ MINT27F                  : logi  NA NA NA
##  $ GAUGE_OTHER              : chr  NA "BISAGRA" NA
##  $ MINT28A                  : num  NA 1 NA
##  $ MINT28B                  : num  1 NA 1
##  $ MINT28C                  : num  NA 0 NA
##  $ MINT28D                  : num  1 0 1
##  $ MINT28F                  : num  NA 0 NA
##  $ PORTHOLE_OTHER           : chr  NA "NONE" NA
##  $ MINT29A                  : num  1 1 NA
##  $ MINT29B                  : num  NA NA 1
##  $ MINT29C                  : num  0 1 NA
##  $ MINT29D                  : num  0 1 1
##  $ MINT29F                  : num  0 NA NA
##  $ ANVIL_OTHER              : chr  "doesn't know" "yunque" NA
##  $ MINT30A                  : logi  NA NA NA
##  $ MINT30B                  : num  1 1 1
##  $ MINT30C                  : logi  NA NA NA
##  $ MINT30D                  : num  1 1 1
##  $ MINT30F                  : logi  NA NA NA
##  $ MORTAR_OTHER             : logi  NA NA NA
##  $ MINT31A                  : num  NA 1 NA
##  $ MINT31B                  : num  1 NA 1
##  $ MINT31C                  : num  NA 0 NA
##  $ MINT31D                  : num  1 0 1
##  $ MINT31F                  : num  NA 0 NA
##  $ PESTLE_OTHER             : chr  NA "none" NA
##  $ MINT32A                  : logi  NA NA NA
##  $ MINT32B                  : num  1 1 1
##  $ MINT32C                  : logi  NA NA NA
##  $ MINT32D                  : num  1 1 1
##  $ MINT32F                  : logi  NA NA NA
##  $ AXLE_OTHER               : logi  NA NA NA
##  $ COMMENTS_MINT            : logi  NA NA NA
##  $ MINT_TOT_NO_CUE          : num  31 27 32
##  $ MINT_STIM_CUE            : num  0 3 0
##  $ MINT_PHON_CUE            : num  1 2 0
##  $ MINT_CORR_PHON_CUE       : num  0 0 0
##  $ MINT_CORR_STIM_CUE       : num  31 30 32
##  $ MINT_CORR_STIM_CUE_STATUS: logi  NA NA NA


Pull the regenerated DD

## load the MINT_RC sheet of the regenerated data dictionary (DD)
dfDD <- readxl::read_excel(path = revisedDDpath, sheet = "MINT_RC")


Handling Logical Variables

## names of all columns currently stored as logical
logicols <- names(df)[vapply(df, is.logical, logical(1))] ## 115 vars

## look those variables up in the regenerated DD
dfDD[
  dfDD[["VarNames"]] %in% logicols,
  c("VarNames", "Data Type")
]
## # A tibble: 115 × 2
##    VarNames        `Data Type`
##    <chr>           <chr>      
##  1 REFCTR          VARCHAR2(6)
##  2 REVIEW_DATE     date       
##  3 REVIEWER        CHAR       
##  4 MINT1A          NUMBER(1)  
##  5 MINT1C          NUMBER(1)  
##  6 MINT1F          NUMBER(1)  
##  7 BUTTERFLY_OTHER CHAR       
##  8 MINT2A          NUMBER(1)  
##  9 MINT2C          NUMBER(1)  
## 10 MINT2F          NUMBER(1)  
## # ℹ 105 more rows
## split the logical columns into groups by the data type the DD declares
## for them (character, date or numeric)
convert2chr <- dfDD$VarNames[dfDD$VarNames %in% logicols & grepl("CHAR", dfDD$`Data Type`, ignore.case = TRUE)] ## 31 vars
convert2date <- dfDD$VarNames[dfDD$VarNames %in% logicols & grepl("date", dfDD$`Data Type`, ignore.case = TRUE)] ## 1 var
convert2num <- dfDD$VarNames[dfDD$VarNames %in% logicols & grepl("NUMBER", dfDD$`Data Type`, ignore.case = TRUE)] ## 83 vars

## convert each group to its DD type
df[convert2chr] <- lapply(df[convert2chr], as.character)
df[convert2date] <- lapply(df[convert2date], as.Date)
## the numeric conversion must read from the convert2num columns themselves;
## feeding df[convert2date] here would coerce the single date column and
## silently recycle it over every column in convert2num
df[convert2num] <- lapply(df[convert2num], as.numeric)

## recheck the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
## 
## [[2]]
## [1] "character"
## 
## [[3]]
## [1] "POSIXct" "POSIXt" 
## 
## [[4]]
## [1] "Date"


Handling Date Variables

## columns of the sub-dataset that are stored as date-times (POSIXct/POSIXt)
datecols <- names(df)[vapply(df, inherits, logical(1), what = c("POSIXct", "POSIXt"))]
## [1] "EXAM_DATE"     "DATE_OF_BIRTH"

## variables that the regenerated DD declares as dates
datecolsFromDD <- dfDD[["VarNames"]][dfDD[["Data Type"]] %in% c("DATE", "date")]

## compare the two in both directions to see if we are missing any date variables
setdiff(datecols, datecolsFromDD) ## character(0)
## character(0)
setdiff(datecolsFromDD, datecols) ## REVIEW_DATE, ignore it, it has been corrected in previous step
## [1] "REVIEW_DATE"
head(df[, datecols])
##    EXAM_DATE DATE_OF_BIRTH
## 1 2023-03-17    1955-09-07
## 2 2024-03-18    1950-03-22
## 3 2023-01-18    1960-05-04
## convert the date-time columns to Date
df[datecols] <- Map(as.Date, df[datecols])

## recheck the unique data types
unique(sapply(df, class))
## [1] "numeric"   "character" "Date"


Handling Character Variables

## character columns present in the sub-dataset
chrcols <- names(df)[vapply(df, is.character, logical(1))] ## 42 vars

## check data type inconsistency:
## mismatchChrs_1: chr in the data but another type in the DD
## mismatchChrs_2: chr in the DD but another type in the data
chrColsfromDD <- dfDD[
  grepl("^(varchar|char)", dfDD[["Data Type"]], ignore.case = TRUE),
  c("VarNames", "Data Type")
]

mismatchChrs_1 <- setdiff(chrcols, chrColsfromDD[["VarNames"]]) ## character(0)
mismatchChrs_2 <- setdiff(chrColsfromDD[["VarNames"]], chrcols) ## character(0)

## DD rows for character variables that come with a value specification
tmp <- dfDD[
  grepl("CHAR|VARCHAR", dfDD[["Data Type"]], ignore.case = TRUE) &
    !is.na(dfDD[["Valid Responses"]]),
  c("VarNames", "Valid Responses")
]

## check whether the unique values of the chr columns in the dataset match the DD
DT::datatable(check_valid_responses(tmp, df))
## All values are within valid ranges.
## ignore EXAMINER


Handling Numeric Variables

## numeric columns present in the sub-dataset
numcols <- names(df)[vapply(df, is.numeric, logical(1))] ## 176 vars

## numeric variables according to the DD, used to check data type inconsistency:
## mismatchNums_1: numeric in the data but another type in the DD
## mismatchNums_2: numeric in the DD but another type in the data
numColsfromDD <- dfDD[
  grepl("number", dfDD[["Data Type"]], ignore.case = TRUE),
  c("VarNames", "Valid Responses")
]

mismatchNums_1 <- setdiff(numcols, numColsfromDD[["VarNames"]]) ## character(0)
mismatchNums_2 <- setdiff(numColsfromDD[["VarNames"]], numcols) ## character(0)

unique(numColsfromDD[["Valid Responses"]])
## [1] NA              "1 thru 99999;" "1 thru 9999;"  "1;"           
## [5] "0;\r\n1;"      "1;\r\n0;"
tmp <- dfDD[
  grepl("number", dfDD[["Data Type"]], ignore.case = TRUE) &
    !is.na(dfDD[["Valid Responses"]]),
  c("VarNames", "Valid Responses")
]

DT::datatable(check_valid_numeric_responses(tmp, df))
## ignore GP


Save Cleaned Data

## store the processed data frame df under the MINT_RC name
MINT_RC <- df



MINT_SP_RC

## work on df within this section; MINT_SP_RC is overwritten with df under "Save Cleaned Data"
df <- MINT_SP_RC

## info() is defined outside this chunk (presumably in the sourced helper script);
## judging by the output below it reports row, column and unique-SYSIND counts -- TODO confirm
info(MINT_SP_RC,"SYSIND")
## #obs:303, cols:221, inds:301
## extract all the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
## 
## [[2]]
## [1] "character"
## 
## [[3]]
## [1] "logical"
## 
## [[4]]
## [1] "POSIXct" "POSIXt"
## print the structure of every column; list.len is set high so the listing is not truncated
str(df, max.level = 99, list.len = 99999)
Click for details
## 'data.frame':    303 obs. of  221 variables:
##  $ SYSXM                       : num  8260003 8260193 8277393 8278083 8260823 ...
##  $ SYSIND                      : num  11163453 11620563 11620453 11621333 11621203 ...
##  $ SYSGP                       : num  7924953 8005633 8005523 8006293 8006163 ...
##  $ SYSGPSTUDY                  : num  1363063 1452343 1452233 1453003 1452873 ...
##  $ SYSINDGP                    : num  7926663 8389633 8389523 8390403 8390273 ...
##  $ CGI_ORDER                   : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ GPS_ORDER                   : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ STDCGI_ORDER                : num  11 11 11 11 11 11 11 11 11 11 ...
##  $ LSTUDY                      : chr  "ADFAMPRADI" "ADCONTROL" "ADCONTROL" "ADCONTROL" ...
##  $ DB_OWNER                    : chr  "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" ...
##  $ STUDY                       : chr  "ALZ" "ALZ" "ALZ" "ALZ" ...
##  $ SUBSTUDY                    : chr  "ADFAMPRADI" "ADCONTROL" "ADCONTROL" "ADCONTROL" ...
##  $ CENTER                      : chr  "IHG" "IHG" "IHG" "IHG" ...
##  $ GP                          : num  87923 104477 104476 104528 104455 ...
##  $ IND                         : num  9000 1 1 1 1 1 1 1 1 1 ...
##  $ REFCTR                      : logi  NA NA NA NA NA NA ...
##  $ EXAM_DATE                   : POSIXct, format: "2023-10-25" "2023-05-15" ...
##  $ EXAMINER                    : chr  "gsv32" "jjs2031" "jjs2031" "jjs2031" ...
##  $ DATE_OF_BIRTH               : POSIXct, format: "1967-06-15" "1949-12-01" ...
##  $ AGE_AT_EXAM                 : num  56 73 73 86 81 77 67 80 74 73 ...
##  $ REVIEW_DATE                 : logi  NA NA NA NA NA NA ...
##  $ REVIEWER                    : logi  NA NA NA NA NA NA ...
##  $ MINT1A_SP                   : num  NA NA NA NA NA NA NA NA NA 1 ...
##  $ MINT1B_SP                   : num  1 1 1 1 1 1 1 1 1 NA ...
##  $ MINT1C_SP                   : num  NA NA NA NA NA NA NA NA NA 0 ...
##  $ MINT1D_SP                   : num  1 1 1 1 1 1 1 1 1 0 ...
##  $ MINT1F_SP                   : num  NA NA NA NA NA NA NA NA NA 1 ...
##  $ TAMBOR_OTHER_SP             : chr  NA NA NA NA ...
##  $ MINT2A_SP                   : num  NA NA NA NA NA NA NA NA NA 1 ...
##  $ MINT2B_SP                   : num  1 1 1 1 1 1 1 1 1 NA ...
##  $ MINT2C_SP                   : num  NA NA NA NA NA NA NA NA NA 0 ...
##  $ MINT2D_SP                   : num  1 1 1 1 1 1 1 1 1 0 ...
##  $ MINT2F_SP                   : num  NA NA NA NA NA NA NA NA NA 0 ...
##  $ GLOVE_OTHER_SP              : chr  NA NA NA NA ...
##  $ MINT3A_SP                   : num  NA NA NA NA NA NA NA NA NA 1 ...
##  $ MINT3B_SP                   : num  1 1 1 1 1 1 1 1 1 NA ...
##  $ MINT3C_SP                   : num  NA NA NA NA NA NA NA NA NA 0 ...
##  $ MINT3D_SP                   : num  1 1 1 1 1 1 1 1 1 0 ...
##  $ MINT3F_SP                   : num  NA NA NA NA NA NA NA NA NA 0 ...
##  $ LIGHTBULB_OTHER_SP          : chr  NA NA NA NA ...
##  $ MINT4A_SP                   : num  NA NA NA 1 1 NA NA NA NA 1 ...
##  $ MINT4B_SP                   : num  1 1 1 NA NA 1 1 1 1 NA ...
##  $ MINT4C_SP                   : num  NA NA NA 1 1 NA NA NA NA 0 ...
##  $ MINT4D_SP                   : num  1 1 1 1 1 1 1 1 1 0 ...
##  $ MINT4F_SP                   : num  NA NA NA NA NA NA NA NA NA 0 ...
##  $ WATCH_OTHER_SP              : chr  NA NA NA "-" ...
##  $ MINT5A_SP                   : num  NA NA NA NA NA NA NA NA NA 1 ...
##  $ MINT5B_SP                   : num  1 1 1 1 1 1 1 1 1 NA ...
##  $ MINT5C_SP                   : num  NA NA NA NA NA NA NA NA NA 1 ...
##  $ MINT5D_SP                   : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ MINT5F_SP                   : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ CANDLE_OTHER_SP             : chr  NA NA NA NA ...
##  $ MINT6A_SP                   : num  NA NA NA NA NA NA NA NA NA 1 ...
##  $ MINT6B_SP                   : num  1 1 1 1 1 1 1 1 1 NA ...
##  $ MINT6C_SP                   : num  NA NA NA NA NA NA NA NA NA 0 ...
##  $ MINT6D_SP                   : num  1 1 1 1 1 1 1 1 1 0 ...
##  $ MINT6F_SP                   : num  NA NA NA NA NA NA NA NA NA 0 ...
##  $ CLOWN_OTHER_SP              : chr  NA NA NA NA ...
##  $ MINT7A_SP                   : num  NA NA NA NA NA NA NA NA NA 1 ...
##  $ MINT7B_SP                   : num  1 1 1 1 1 1 1 1 1 NA ...
##  $ MINT7C_SP                   : num  NA NA NA NA NA NA NA NA NA 0 ...
##  $ MINT7D_SP                   : num  1 1 1 1 1 1 1 1 1 0 ...
##  $ MINT7F_SP                   : num  NA NA NA NA NA NA NA NA NA 1 ...
##  $ KITE_OTHER_SP               : chr  NA NA NA NA ...
##  $ MINT8A_SP                   : num  NA NA NA NA NA NA NA NA NA 1 ...
##  $ MINT8B_SP                   : num  1 1 1 1 1 1 1 1 1 NA ...
##  $ MINT8C_SP                   : num  NA NA NA NA NA NA NA NA NA 0 ...
##  $ MINT8D_SP                   : num  1 1 1 1 1 1 1 1 1 0 ...
##  $ MINT8F_SP                   : num  NA NA NA NA NA NA NA NA NA 0 ...
##  $ RAINBOW_OTHER_SP            : chr  NA NA NA NA ...
##  $ MINT9A_SP                   : num  NA NA NA NA NA NA NA NA NA 1 ...
##  $ MINT9B_SP                   : num  1 1 1 1 1 1 1 1 1 NA ...
##  $ MINT9C_SP                   : num  NA NA NA NA NA NA NA NA NA 0 ...
##  $ MINT9D_SP                   : num  1 1 1 1 1 1 1 1 1 0 ...
##  $ MINT9F_SP                   : num  NA NA NA NA NA NA NA NA NA 1 ...
##  $ WITCH_OTHER_SP              : chr  NA NA NA NA ...
##  $ MINT10A_SP                  : num  NA NA NA NA NA NA NA NA NA 1 ...
##  $ MINT10B_SP                  : num  1 1 1 1 1 1 1 1 1 NA ...
##  $ MINT10C_SP                  : num  NA NA NA NA NA NA NA NA NA 0 ...
##  $ MINT10D_SP                  : num  1 1 1 1 1 1 1 1 1 0 ...
##  $ MINT10F_SP                  : num  NA NA NA NA NA NA NA NA NA 1 ...
##  $ SEESAW_OTHER_SP             : chr  NA NA NA NA ...
##  $ MINT11A_SP                  : num  NA NA NA NA NA NA NA NA NA 1 ...
##  $ MINT11B_SP                  : num  1 1 1 1 1 1 1 1 1 NA ...
##  $ MINT11C_SP                  : num  NA NA NA NA NA NA NA NA NA 1 ...
##  $ MINT11D_SP                  : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ MINT11F_SP                  : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ FLASHLIGHT_OTHER_SP         : chr  NA NA NA NA ...
##  $ MINT12A_SP                  : num  NA NA NA NA NA NA NA NA NA 1 ...
##  $ MINT12B_SP                  : num  1 1 1 1 1 1 1 1 1 NA ...
##  $ MINT12C_SP                  : num  NA NA NA NA NA NA NA NA NA 0 ...
##  $ MINT12D_SP                  : num  1 1 1 1 1 1 1 1 1 0 ...
##  $ MINT12F_SP                  : num  NA NA NA NA NA NA NA NA NA 0 ...
##  $ PEACOCK_OTHER_SP            : chr  NA NA NA NA ...
##  $ MINT13A_SP                  : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ MINT13B_SP                  : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ MINT13C_SP                  : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ MINT13D_SP                  : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ MINT13F_SP                  : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ SNAIL_OTHER_SP              : chr  NA NA NA NA ...
##  $ MINT14A_SP                  : num  NA NA NA NA NA NA NA NA NA 1 ...
##  $ MINT14B_SP                  : num  1 1 1 1 1 1 1 1 1 NA ...
##  $ MINT14C_SP                  : num  NA NA NA NA NA NA NA NA NA 0 ...
##  $ MINT14D_SP                  : num  1 1 1 1 1 1 1 1 1 0 ...
##  $ MINT14F_SP                  : num  NA NA NA NA NA NA NA NA NA 1 ...
##  $ WHALE_OTHER_SP              : chr  NA NA NA NA ...
##  $ MINT15A_SP                  : num  NA NA 1 NA 1 NA NA NA NA 1 ...
##  $ MINT15B_SP                  : num  1 1 NA 1 NA 1 1 1 1 NA ...
##  $ MINT15C_SP                  : num  NA NA 1 NA 1 NA NA NA NA 1 ...
##  $ MINT15D_SP                  : num  NA NA 1 NA 1 NA NA NA NA 1 ...
##  $ MINT15F_SP                  : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ CAGE_OTHER_SP               : chr  NA NA "---" NA ...
##  $ MINT16A_SP                  : num  NA NA NA NA NA NA NA NA NA 1 ...
##  $ MINT16B_SP                  : num  1 1 1 1 1 1 1 1 1 NA ...
##  $ MINT16C_SP                  : num  NA NA NA NA NA NA NA NA NA 0 ...
##  $ MINT16D_SP                  : num  1 1 1 1 1 1 1 1 1 0 ...
##  $ MINT16F_SP                  : num  NA NA NA NA NA NA NA NA NA 1 ...
##  $ NEST_OTHER_SP               : chr  NA NA NA NA ...
##  $ MINT17A_SP                  : num  NA NA NA NA NA NA NA NA NA 1 ...
##  $ MINT17B_SP                  : num  1 1 1 1 1 1 1 1 1 NA ...
##  $ MINT17C_SP                  : num  NA NA NA NA NA NA NA NA NA 0 ...
##  $ MINT17D_SP                  : num  1 1 1 1 1 1 1 1 1 0 ...
##  $ MINT17F_SP                  : num  NA NA NA NA NA NA NA NA NA 0 ...
##  $ PLUG_OTHER_SP               : chr  NA NA NA NA ...
##  $ MINT18A_SP                  : num  NA NA NA NA NA NA NA NA NA 1 ...
##  $ MINT18B_SP                  : num  1 1 1 1 1 1 1 1 1 NA ...
##  $ MINT18C_SP                  : num  NA NA NA NA NA NA NA NA NA 0 ...
##  $ MINT18D_SP                  : num  1 1 1 1 1 1 1 1 1 0 ...
##  $ MINT18F_SP                  : num  NA NA NA NA NA NA NA NA NA 0 ...
##  $ WIG_OTHER_SP                : chr  NA NA NA NA ...
##  $ MINT19A_SP                  : num  NA NA NA NA NA NA NA NA NA 1 ...
##  $ MINT19B_SP                  : num  1 1 1 1 1 1 1 1 1 NA ...
##  $ MINT19C_SP                  : num  NA NA NA NA NA NA NA NA NA 0 ...
##  $ MINT19D_SP                  : num  1 1 1 1 1 1 1 1 1 0 ...
##  $ MINT19F_SP                  : num  NA NA NA NA NA NA NA NA NA 1 ...
##  $ SCREW_OTHER_SP              : chr  NA NA NA NA ...
##  $ MINT20A_SP                  : num  NA NA NA NA NA NA NA NA NA 1 ...
##  $ MINT20B_SP                  : num  1 1 1 1 1 1 1 1 1 NA ...
##  $ MINT20C_SP                  : num  NA NA NA NA NA NA NA NA NA 0 ...
##  $ MINT20D_SP                  : num  1 1 1 1 1 1 1 1 1 0 ...
##  $ MINT20F_SP                  : num  NA NA NA NA NA NA NA NA NA 0 ...
##  $ SCARF_OTHER_SP              : chr  NA NA NA NA ...
##  $ MINT21A_SP                  : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ MINT21B_SP                  : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ MINT21C_SP                  : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ MINT21D_SP                  : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ MINT21F_SP                  : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ WELL_OTHER_SP               : chr  NA NA NA NA ...
##  $ MINT22A_SP                  : num  NA NA NA NA NA NA NA NA NA 1 ...
##  $ MINT22B_SP                  : num  1 1 1 1 1 1 1 1 1 NA ...
##  $ MINT22C_SP                  : num  NA NA NA NA NA NA NA NA NA 0 ...
##  $ MINT22D_SP                  : num  1 1 1 1 1 1 1 1 1 0 ...
##  $ MINT22F_SP                  : num  NA NA NA NA NA NA NA NA NA 0 ...
##  $ DUSTPAN_OTHER_SP            : chr  NA NA NA NA ...
##  $ MINT23A_SP                  : num  NA NA NA NA NA NA NA NA NA 1 ...
##  $ MINT23B_SP                  : num  1 1 1 1 1 1 1 1 1 NA ...
##  $ MINT23C_SP                  : num  NA NA NA NA NA NA NA NA NA 0 ...
##  $ MINT23D_SP                  : num  1 1 1 1 1 1 1 1 1 0 ...
##  $ MINT23F_SP                  : num  NA NA NA NA NA NA NA NA NA 1 ...
##  $ PARACHUTE_OTHER_SP          : chr  NA NA NA NA ...
##  $ MINT24A_SP                  : num  NA NA NA NA NA NA NA NA NA 1 ...
##  $ MINT24B_SP                  : num  1 1 1 1 1 1 1 1 1 NA ...
##  $ MINT24C_SP                  : num  NA NA NA NA NA NA NA NA NA 0 ...
##  $ MINT24D_SP                  : num  1 1 1 1 1 1 1 1 1 0 ...
##  $ MINT24F_SP                  : num  NA NA NA NA NA NA NA NA NA 0 ...
##  $ BLIND_OTHER_SP              : chr  NA NA NA NA ...
##  $ MINT25A_SP                  : num  NA NA NA NA NA NA 1 NA NA 1 ...
##  $ MINT25B_SP                  : num  1 1 1 1 1 1 NA 1 1 NA ...
##  $ MINT25C_SP                  : num  NA NA NA NA NA NA 0 NA NA 1 ...
##  $ MINT25D_SP                  : num  1 1 1 1 1 1 0 1 1 1 ...
##  $ MINT25F_SP                  : num  NA NA NA NA NA NA 1 NA NA NA ...
##  $ HINGE_OTHER_SP              : chr  NA NA NA NA ...
##  $ MINT26A_SP                  : num  NA NA NA NA NA NA NA NA NA 1 ...
##  $ MINT26B_SP                  : num  1 1 1 1 1 1 1 1 1 NA ...
##  $ MINT26C_SP                  : num  NA NA NA NA NA NA NA NA NA 1 ...
##  $ MINT26D_SP                  : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ MINT26F_SP                  : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ FUNNEL_OTHER_SP             : chr  NA NA NA NA ...
##  $ MINT27A_SP                  : num  NA NA NA NA NA NA NA NA NA 1 ...
##  $ MINT27B_SP                  : num  1 1 1 1 1 1 1 1 1 NA ...
##  $ MINT27C_SP                  : num  NA NA NA NA NA NA NA NA NA 1 ...
##  $ MINT27D_SP                  : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ MINT27F_SP                  : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ GAUGE_OTHER_SP              : chr  NA NA NA NA ...
##  $ MINT28A_SP                  : num  1 NA NA NA 1 NA NA NA NA 1 ...
##  $ MINT28B_SP                  : num  NA 1 1 1 NA 1 1 1 1 NA ...
##  $ MINT28C_SP                  : num  0 NA NA NA 1 NA NA NA NA 1 ...
##  $ MINT28D_SP                  : num  0 1 1 1 1 1 1 1 1 1 ...
##  $ MINT28F_SP                  : num  1 NA NA NA NA NA NA NA NA NA ...
##  $ PORTHOLE_OTHER_SP           : chr  "bisagra" NA NA NA ...
##  $ MINT29A_SP                  : num  1 1 NA NA NA NA 1 NA NA NA ...
##  $ MINT29B_SP                  : num  NA NA 1 1 1 1 NA 1 1 1 ...
##  $ MINT29C_SP                  : num  0 0 NA NA NA NA 0 NA NA NA ...
##  $ MINT29D_SP                  : num  0 0 1 1 1 1 0 1 1 1 ...
##  $ MINT29F_SP                  : num  0 0 NA NA NA NA 0 NA NA NA ...
##  $ ANVIL_OTHER_SP              : chr  "n/a" NA NA NA ...
##  $ MINT30A_SP                  : num  NA 1 NA NA NA NA NA NA NA 1 ...
##  $ MINT30B_SP                  : num  1 NA 1 1 1 1 1 1 1 NA ...
##  $ MINT30C_SP                  : num  NA 0 NA NA NA NA NA NA NA 0 ...
##  $ MINT30D_SP                  : num  1 0 1 1 1 1 1 1 1 0 ...
##  $ MINT30F_SP                  : num  NA 0 NA NA NA NA NA NA NA 0 ...
##  $ MORTAR_OTHER_SP             : chr  NA NA NA NA ...
##  $ MINT31A_SP                  : num  1 1 NA NA NA NA 1 NA NA 1 ...
##  $ MINT31B_SP                  : num  NA NA 1 1 1 1 NA 1 1 NA ...
##  $ MINT31C_SP                  : num  0 0 NA NA NA NA 0 NA NA 0 ...
##  $ MINT31D_SP                  : num  0 0 1 1 1 1 0 1 1 0 ...
##  $ MINT31F_SP                  : num  0 0 NA NA NA NA 0 NA NA 0 ...
##  $ PESTLE_OTHER_SP             : chr  "n/a" NA NA NA ...
##  $ MINT32A_SP                  : num  NA NA NA NA NA NA 1 NA NA 1 ...
##  $ MINT32B_SP                  : num  1 1 1 1 1 1 NA 1 1 NA ...
##  $ MINT32C_SP                  : num  NA NA NA NA NA NA 1 NA NA 0 ...
##  $ MINT32D_SP                  : num  1 1 1 1 1 1 1 1 1 0 ...
##  $ MINT32F_SP                  : num  NA NA NA NA NA NA NA NA NA 0 ...
##  $ AXLE_OTHER_SP               : chr  NA NA NA NA ...
##  $ COMMENTS_MINT_SP            : chr  NA NA NA NA ...
##  $ MINT_TOT_NO_CUE_SP          : num  29 29 31 31 29 32 28 32 32 3 ...
##  $ MINT_STIM_CUE_SP            : num  0 0 1 1 3 0 1 0 0 7 ...
##  $ MINT_PHON_CUE_SP            : num  3 3 0 0 0 0 3 0 0 22 ...
##  $ MINT_CORR_PHON_CUE_SP       : num  1 0 0 0 0 0 1 0 0 8 ...
##  $ MINT_CORR_STIM_CUE_SP       : num  29 29 32 32 32 32 29 32 32 10 ...
##  $ MINT_CORR_STIM_CUE_SP_STATUS: chr  NA NA NA NA ...


Pull the regenerated DD

## load the MINT_SP_RC sheet of the regenerated data dictionary (DD)
dfDD <- readxl::read_excel(path = revisedDDpath, sheet = "MINT_SP_RC")


Handling Logical Variables

## extract all logical variables
logicols <- colnames(df)[sapply(df, is.logical)]

## view those variables in the regeneraed DD
dfDD[dfDD$VarNames %in% logicols,c("VarNames","Data Type")]
## # A tibble: 3 × 2
##   VarNames    `Data Type`
##   <chr>       <chr>      
## 1 REFCTR      VARCHAR2(6)
## 2 REVIEW_DATE date       
## 3 REVIEWER    CHAR
## split the logical columns into groups by the data type the DD declares
## for them (character or date); TRUE is spelled out and the stray empty
## argument in grepl() is dropped, the selections themselves are unchanged
convert2chr <- dfDD$VarNames[dfDD$VarNames %in% logicols & grepl("CHAR", dfDD$`Data Type`, ignore.case = TRUE)] ## 2 vars
convert2date <- dfDD$VarNames[dfDD$VarNames %in% logicols & grepl("date", dfDD$`Data Type`, ignore.case = TRUE)] ## 1 var

## convert each group to its DD type
df[convert2chr] <- lapply(df[convert2chr], as.character)
df[convert2date] <- lapply(df[convert2date], as.Date)

## recheck the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
## 
## [[2]]
## [1] "character"
## 
## [[3]]
## [1] "POSIXct" "POSIXt" 
## 
## [[4]]
## [1] "Date"


Handling Date Variables

## columns of the sub-dataset that are stored as date-times (POSIXct/POSIXt)
datecols <- names(df)[vapply(df, inherits, logical(1), what = c("POSIXct", "POSIXt"))]
## [1] "EXAM_DATE"     "DATE_OF_BIRTH"

## variables that the regenerated DD declares as dates
datecolsFromDD <- dfDD[["VarNames"]][dfDD[["Data Type"]] %in% c("DATE", "date")]

## compare the two in both directions to see if we are missing any date variables
setdiff(datecols, datecolsFromDD) ## character(0)
## character(0)
setdiff(datecolsFromDD, datecols) ## REVIEW_DATE, ignore it, it has been corrected in previous step
## [1] "REVIEW_DATE"
head(df[, datecols])
##    EXAM_DATE DATE_OF_BIRTH
## 1 2023-10-25    1967-06-15
## 2 2023-05-15    1949-12-01
## 3 2023-05-15    1950-04-02
## 4 2023-05-09    1936-05-22
## 5 2023-02-24    1941-10-04
## 6 2023-08-11    1946-06-19
## convert the date-time columns to Date
df[datecols] <- Map(as.Date, df[datecols])

## recheck the unique data types
unique(sapply(df, class))
## [1] "numeric"   "character" "Date"


Handling Character Variables

## character columns present in the sub-dataset
chrcols <- names(df)[vapply(df, is.character, logical(1))] ## 42 vars

## check data type inconsistency:
## mismatchChrs_1: chr in the data but another type in the DD
## mismatchChrs_2: chr in the DD but another type in the data
chrColsfromDD <- dfDD[
  grepl("^(varchar|char)", dfDD[["Data Type"]], ignore.case = TRUE),
  c("VarNames", "Data Type")
]

mismatchChrs_1 <- setdiff(chrcols, chrColsfromDD[["VarNames"]]) ## character(0)
mismatchChrs_2 <- setdiff(chrColsfromDD[["VarNames"]], chrcols) ## character(0)

## DD rows for character variables that come with a value specification
tmp <- dfDD[
  grepl("CHAR|VARCHAR", dfDD[["Data Type"]], ignore.case = TRUE) &
    !is.na(dfDD[["Valid Responses"]]),
  c("VarNames", "Valid Responses")
]

## check whether the unique values of the chr columns in the dataset match the DD
DT::datatable(check_valid_responses(tmp, df))
## All values are within valid ranges.
## ignore EXAMINER


Handling Numeric Variables

## numeric columns present in the sub-dataset
numcols <- names(df)[vapply(df, is.numeric, logical(1))] ## 176 vars

## numeric variables according to the DD, used to check data type inconsistency:
## mismatchNums_1: numeric in the data but another type in the DD
## mismatchNums_2: numeric in the DD but another type in the data
numColsfromDD <- dfDD[
  grepl("number", dfDD[["Data Type"]], ignore.case = TRUE),
  c("VarNames", "Valid Responses")
]

mismatchNums_1 <- setdiff(numcols, numColsfromDD[["VarNames"]]) ## character(0)
mismatchNums_2 <- setdiff(numColsfromDD[["VarNames"]], numcols) ## character(0)

unique(numColsfromDD[["Valid Responses"]])
## [1] NA              "1 thru 99999;" "1 thru 9999;"  "1;"           
## [5] "0;\r\n1;"      "1;\r\n0;"
tmp <- dfDD[
  grepl("number", dfDD[["Data Type"]], ignore.case = TRUE) &
    !is.na(dfDD[["Valid Responses"]]),
  c("VarNames", "Valid Responses")
]

DT::datatable(check_valid_numeric_responses(tmp, df))
## ignore GP


Save Cleaned Data

## overwrite MINT_SP_RC with the processed data frame df
MINT_SP_RC <- df



MOCA_RC

## work on df within this section, starting from the MOCA_RC data frame
df <- MOCA_RC

## info() is defined outside this chunk (presumably in the sourced helper script);
## judging by the output below it reports row, column and unique-SYSIND counts -- TODO confirm
info(MOCA_RC,"SYSIND")
## #obs:585, cols:140, inds:580
## extract all the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
## 
## [[2]]
## [1] "character"
## 
## [[3]]
## [1] "logical"
## 
## [[4]]
## [1] "POSIXct" "POSIXt"
## print the structure of every column; list.len is set high so the listing is not truncated
str(df, max.level = 99, list.len = 99999)
Click for details
## 'data.frame':    585 obs. of  140 variables:
##  $ SYSXM                         : num  8258783 8258823 8259093 8260053 8260123 ...
##  $ SYSIND                        : num  11037673 11369813 11024163 11620563 11362953 ...
##  $ SYSGP                         : num  7894423 7952013 7889113 8005633 7946353 ...
##  $ SYSGPSTUDY                    : num  1309743 1397123 1304453 1452343 1387463 ...
##  $ SYSINDGP                      : num  7793413 8139083 7779783 8389633 8132223 ...
##  $ CGI_ORDER                     : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ GPS_ORDER                     : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ STDCGI_ORDER                  : num  11 11 11 11 11 11 11 11 11 11 ...
##  $ LSTUDY                        : chr  "ADFAMPRADI" "ADCRLPRADI" "ADFAMPRADI" "ADCONTROL" ...
##  $ DB_OWNER                      : chr  "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" ...
##  $ STUDY                         : chr  "ALZ" "ALZ" "ALZ" "ALZ" ...
##  $ SUBSTUDY                      : chr  "ADFAMPRADI" "ADCRLPRADI" "ADFAMPRADI" "ADCONTROL" ...
##  $ CENTER                        : chr  "IHG" "IHG" "IHG" "IHG" ...
##  $ GP                            : num  87650 88301 87536 104477 87545 ...
##  $ IND                           : num  9000 1 112 1 106 ...
##  $ REFCTR                        : logi  NA NA NA NA NA NA ...
##  $ EXAM_DATE                     : POSIXct, format: "2023-10-24" "2024-02-13" ...
##  $ EXAMINER                      : chr  "gsv32" "jjs2031" "jjs2031" "jjs2031" ...
##  $ DATE_OF_BIRTH                 : POSIXct, format: "1954-10-29" "1947-05-13" ...
##  $ AGE_AT_EXAM                   : num  68 76 79 73 66 81 86 73 81 60 ...
##  $ REVIEW_DATE                   : logi  NA NA NA NA NA NA ...
##  $ REVIEWER                      : logi  NA NA NA NA NA NA ...
##  $ MOCALOC                       : num  2 2 3 3 2 2 3 2 2 2 ...
##  $ MOCALOC_OTHER                 : chr  NA NA NA NA ...
##  $ MOCALAN                       : num  2 2 2 2 2 2 2 2 2 2 ...
##  $ MOCALANX                      : logi  NA NA NA NA NA NA ...
##  $ MOCATRAI                      : num  0 1 1 1 0 1 1 1 1 1 ...
##  $ MOCACUBE                      : num  0 0 1 0 0 0 0 0 1 1 ...
##  $ MOCACLOC                      : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ MOCACLON                      : num  1 0 1 1 1 1 1 0 0 1 ...
##  $ MOCACLOH                      : num  1 0 1 1 0 0 0 1 1 0 ...
##  $ MOCANAMI_LION                 : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ MOCANAMI_LION_OTH             : chr  NA NA NA NA ...
##  $ MOCANAMI_RHINO                : num  1 0 0 1 1 1 1 1 1 1 ...
##  $ MOCANAMI_RHINO_OTH            : chr  NA NA NA NA ...
##  $ MOCANAMI_CAMEL                : num  1 1 1 1 1 1 1 0 1 1 ...
##  $ MOCANAMI_CAMEL_OTH            : chr  NA NA NA NA ...
##  $ MOCAREGI1                     : num  0 0 1 1 1 1 0 1 1 1 ...
##  $ MOCAREGI2                     : num  1 1 1 0 1 0 0 1 1 1 ...
##  $ MOCAREGI3                     : num  1 1 0 0 0 1 1 1 1 0 ...
##  $ MOCAREGI4                     : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ MOCAREGI5                     : num  1 0 1 1 1 1 1 0 0 1 ...
##  $ MOCAREGI6                     : num  1 1 1 1 1 0 1 1 1 1 ...
##  $ MOCAREGI7                     : num  0 1 1 1 1 1 1 1 1 1 ...
##  $ MOCAREGI8                     : num  1 1 1 1 1 1 1 1 0 1 ...
##  $ MOCAREGI9                     : num  1 1 0 1 1 1 0 1 1 1 ...
##  $ MOCAREGI10                    : num  1 1 1 1 1 1 0 1 1 1 ...
##  $ MOCADIGI_FORW                 : num  1 1 1 1 1 0 0 1 1 1 ...
##  $ MOCADIGI_FORW_INCORRECT       : chr  NA NA NA NA ...
##  $ MOCADIGI_BACK                 : num  1 1 1 0 1 1 0 0 1 1 ...
##  $ MOCADIGI_BACK_INCORRECT       : chr  NA NA NA NA ...
##  $ MOCALETT                      : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ MOCASER7_93                   : num  1 0 1 1 1 1 1 1 1 1 ...
##  $ MOCASER7_93_OTH               : logi  NA NA NA NA NA NA ...
##  $ MOCASER7_86                   : num  0 0 1 1 0 1 1 0 1 1 ...
##  $ MOCASER7_86_OTH               : logi  NA NA NA NA NA NA ...
##  $ MOCASER7_79                   : num  1 0 1 1 1 1 1 0 0 1 ...
##  $ MOCASER7_79_OTH               : logi  NA NA NA NA NA NA ...
##  $ MOCASER7_72                   : num  1 0 1 1 1 1 1 0 1 1 ...
##  $ MOCASER7_72_OTH               : logi  NA NA NA NA NA NA ...
##  $ MOCASER7_65                   : num  0 0 1 1 0 0 1 0 0 1 ...
##  $ MOCASER7_65_OTH               : logi  NA NA NA NA NA NA ...
##  $ MOCAREPE_1                    : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ MOCAREPE_2                    : num  1 0 1 1 0 0 0 1 1 1 ...
##  $ MOCAFLUEF_60SEC               : chr  "finca, feo, flor, farmacia, fosforo, freno, ficticio" "FRACCION FAMILIA FRACCION (X) FUERTE" "Feo Fricos Farmacia Faro" "FALSO FEO FRIALDAD FENOMENO FACILIDAD FELICIDAD" ...
##  $ MOCAFLUE_SCORE                : num  0 0 0 0 1 0 0 1 1 1 ...
##  $ MOCAABST_TRAIN                : num  1 1 1 1 0 0 1 1 1 1 ...
##  $ MOCAABST_RULER                : num  1 1 1 1 0 1 0 1 1 1 ...
##  $ MOCARECN_1                    : num  1 NA NA NA 1 NA NA NA NA NA ...
##  $ MOCARECN_2                    : num  1 NA NA NA NA NA NA NA 1 1 ...
##  $ MOCARECN_3                    : num  1 NA NA NA 1 NA 1 NA NA 1 ...
##  $ MOCARECN_4                    : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ MOCARECN_5                    : num  NA NA NA NA NA NA 1 NA 1 NA ...
##  $ MOCARECC_1                    : num  NA NA NA 1 NA NA NA NA NA NA ...
##  $ MOCARECC_2                    : num  NA NA NA 1 1 NA 1 NA NA NA ...
##  $ MOCARECC_3                    : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ MOCARECC_4                    : num  NA NA NA 1 NA NA 1 NA NA NA ...
##  $ MOCARECC_5                    : num  1 1 NA 1 NA 1 NA NA NA NA ...
##  $ MOCARECR_1                    : num  NA 1 1 NA NA NA 1 1 1 1 ...
##  $ MOCARECR_2                    : num  NA 1 1 NA NA NA NA 1 NA NA ...
##  $ MOCARECR_3                    : num  NA 1 1 1 NA 1 NA 1 1 NA ...
##  $ MOCARECR_4                    : num  1 1 1 NA 1 1 NA NA 1 1 ...
##  $ MOCARECR_5                    : num  NA NA 1 NA 1 NA NA NA NA 1 ...
##  $ MOCARECN_REC1                 : num  NA NA NA NA NA 0 NA NA NA NA ...
##  $ MOCARECN_REC2                 : num  NA NA NA NA NA 0 NA NA NA NA ...
##  $ MOCARECN_REC3                 : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ MOCARECN_REC4                 : num  NA NA NA NA NA NA NA 0 NA NA ...
##  $ MOCARECN_REC5                 : num  NA NA NA NA NA NA NA 0 NA NA ...
##  $ MOCAORDT_ENTRY                : POSIXct, format: "2023-10-23" "2024-02-13" ...
##  $ MOCAORDT                      : num  0 1 1 1 0 1 1 1 1 1 ...
##  $ MOCAORMO_ENTRY                : chr  NA NA NA NA ...
##  $ MOCAORMO                      : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ MOCAORYR_ENTRY                : chr  NA NA NA NA ...
##  $ MOCAORYR                      : num  1 1 1 1 0 1 1 1 1 1 ...
##  $ MOCAORDY_ENTRY                : chr  NA NA NA NA ...
##  $ MOCAORDY                      : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ MOCAORPL_ENTRY                : chr  NA NA NA NA ...
##  $ MOCAORPL                      : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ MOCAORCT_ENTRY                : chr  NA NA NA NA ...
##  $ MOCAORCT                      : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ MOCA_EDU                      : num  1 0 0 0 1 1 0 0 0 0 ...
##  $ NACC_MOCA                     : chr  NA NA NA NA ...
##  $ MOCAVISEXE_SCORE              : num  3 2 5 4 2 3 3 3 4 4 ...
##  $ MOCAVISEXE_SCORE_STATUS       : chr  NA NA NA NA ...
##  $ MOCANAMI_SCORE                : num  3 2 2 3 3 3 3 2 3 3 ...
##  $ MOCANAMI_SCORE_STATUS         : chr  NA NA NA NA ...
##  $ SCORE_REGISTRATION            : num  8 8 8 8 9 8 6 9 8 9 ...
##  $ SCORE_REGISTRATION_STATUS     : logi  NA NA NA NA NA NA ...
##  $ MOCADIGI_SCORE                : num  2 2 2 1 2 1 0 1 2 2 ...
##  $ MOCADIGI_SCORE_STATUS         : logi  NA NA NA NA NA NA ...
##  $ MOCASER7_93_SCORE             : num  1 0 1 1 1 1 1 1 1 1 ...
##  $ MOCASER7_93_SCORE_STATUS      : logi  NA NA NA NA NA NA ...
##  $ MOCASER7_86_SCORE             : num  0 0 1 1 0 1 1 0 1 1 ...
##  $ MOCASER7_86_SCORE_SCORE_STATUS: logi  NA NA NA NA NA NA ...
##  $ MOCASER7_79_SCORE             : num  1 0 1 1 1 1 1 0 0 1 ...
##  $ MOCASER7_79_SCORE_STATUS      : logi  NA NA NA NA NA NA ...
##  $ MOCASER7_72_SCORE             : num  1 0 1 1 1 1 1 0 1 1 ...
##  $ MOCASER7_72_SCORE_STATUS      : logi  NA NA NA NA NA NA ...
##  $ MOCASER7_65_SCORE             : num  0 0 1 1 0 0 1 0 0 1 ...
##  $ MOCASER7_65_SCORE_STATUS      : logi  NA NA NA NA NA NA ...
##  $ MOCASER7_SCORE                : num  3 0 5 5 3 4 5 1 3 5 ...
##  $ MOCASER7_SCORE_STATUS         : chr  NA NA NA NA ...
##  $ MOCASER7_POINTSCORE           : num  2 0 3 3 2 3 3 1 2 3 ...
##  $ MOCASER7_POINTSCORE_STATUS    : chr  NA NA NA NA ...
##  $ MOCAREPE_SCORE                : num  2 1 2 2 1 1 1 2 2 2 ...
##  $ MOCAREPE_SCORE_STATUS         : logi  NA NA NA NA NA NA ...
##  $ MOCAABST_SCORE                : num  2 2 2 2 0 1 1 2 2 2 ...
##  $ MOCAABST_SCORE_STATUS         : logi  NA NA NA NA NA NA ...
##  $ MOCARECN_SCORE                : num  3 0 0 0 2 0 2 0 2 2 ...
##  $ MOCARECN_SCORE_STATUS         : chr  "partial" NA NA NA ...
##  $ MOCARECC_SCORE                : num  1 1 0 4 1 1 2 0 0 0 ...
##  $ MOCARECC_SCORE_STATUS         : chr  "partial" "partial" NA "partial" ...
##  $ MOCARECR_SCORE                : num  1 4 5 1 2 2 1 3 3 3 ...
##  $ MOCARECR_SCORE_STATUS         : chr  "partial" "partial" NA "partial" ...
##  $ MOCAOR_SCORE                  : num  5 6 6 6 4 6 6 6 6 6 ...
##  $ MOCAOR_SCORE_STATUS           : chr  NA NA NA NA ...
##  $ MOCATOTS                      : num  23 16 23 22 18 19 20 19 25 26 ...
##  $ MOCATOTS_STATUS               : chr  "partial" "partial" "partial" "partial" ...
##  $ NACCMOCA                      : num  24 16 23 22 19 20 20 19 25 26 ...
##  $ NACCMOCA_STATUS               : chr  "partial" "partial" "partial" "partial" ...


Pull the regenerated DD

dfDD <- read_excel(revisedDDpath, sheet = "MOCA_RC")


Handling Logical Variables

## names of the columns currently stored as logical in the sub-dataset
logicols <- names(Filter(is.logical, df))

## look up those variables (name + declared data type) in the regenerated DD
dfDD[dfDD[["VarNames"]] %in% logicols, c("VarNames", "Data Type")]
## # A tibble: 18 × 2
##    VarNames                       `Data Type` 
##    <chr>                          <chr>       
##  1 REFCTR                         VARCHAR2(6) 
##  2 REVIEW_DATE                    date        
##  3 REVIEWER                       CHAR        
##  4 MOCALANX                       VARCHAR2(25)
##  5 MOCASER7_93_OTH                VARCHAR2(5) 
##  6 MOCASER7_86_OTH                VARCHAR2(5) 
##  7 MOCASER7_79_OTH                VARCHAR2(5) 
##  8 MOCASER7_72_OTH                VARCHAR2(5) 
##  9 MOCASER7_65_OTH                VARCHAR2(5) 
## 10 SCORE_REGISTRATION_STATUS      CHAR        
## 11 MOCADIGI_SCORE_STATUS          CHAR        
## 12 MOCASER7_93_SCORE_STATUS       CHAR        
## 13 MOCASER7_86_SCORE_SCORE_STATUS CHAR        
## 14 MOCASER7_79_SCORE_STATUS       CHAR        
## 15 MOCASER7_72_SCORE_STATUS       CHAR        
## 16 MOCASER7_65_SCORE_STATUS       CHAR        
## 17 MOCAREPE_SCORE_STATUS          CHAR        
## 18 MOCAABST_SCORE_STATUS          CHAR
## select the vars to be converted to date
convert2date <- dfDD$VarNames[dfDD$VarNames %in% logicols & grepl("date", dfDD$`Data Type`, ignore.case = TRUE)] ## REVIEW_DATE

## select the vars to be converted to numeric
## recomputed from this dataset's DD so that a `convert2num` left over from a
## previously processed dataset cannot leak into the setdiff() below
convert2num <- dfDD$VarNames[dfDD$VarNames %in% logicols & grepl("number", dfDD$`Data Type`, ignore.case = TRUE)]

## the rest should be converted to character
convert2chr <- setdiff(logicols, c(convert2num, convert2date)) ## 17 vars

## convert (the numeric step is skipped when no logical column is typed as number in the DD)
if (length(convert2num) > 0) {
  df[convert2num] <- lapply(df[convert2num], as.numeric)
}
df[convert2date] <- lapply(df[convert2date], as.Date)
df[convert2chr] <- lapply(df[convert2chr], as.character)

## recheck the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
## 
## [[2]]
## [1] "character"
## 
## [[3]]
## [1] "POSIXct" "POSIXt" 
## 
## [[4]]
## [1] "Date"


Handling Date Variables

## extract date variables from sub-dataset
datecols <- colnames(df)[sapply(df, function(x) inherits(x, c("POSIXct", "POSIXt")))]
## [1] "EXAM_DATE"      "DATE_OF_BIRTH"  "MOCAORDT_ENTRY"

## extract date variables from regenerated DD
datecolsFromDD <- dfDD$VarNames[dfDD$`Data Type` %in% c("DATE","date","Date")]

## compare the two to see if we missing any date variables
setdiff(datecols,datecolsFromDD) ## character(0)
## character(0)
setdiff(datecolsFromDD,datecols) ## [1] "REVIEW_DATE" can ignore REVIEW_DATE, as it has been corrected in previous step
## [1] "REVIEW_DATE"
head(df[,datecols])
##    EXAM_DATE DATE_OF_BIRTH MOCAORDT_ENTRY
## 1 2023-10-24    1954-10-29     2023-10-23
## 2 2024-02-13    1947-05-13     2024-02-13
## 3 2023-10-25    1944-07-28     2023-10-25
## 4 2023-05-15    1949-12-01     2023-05-15
## 5 2024-02-20    1957-08-05     2024-02-20
## 6 2024-02-15    1942-09-30     2024-02-15
## convert format
df[datecols] <- lapply(df[datecols], as.Date)

## recheck the unique data types
unique(sapply(df, class))
## [1] "numeric"   "character" "Date"


Handling Character Variables

## character variables as they are stored in the sub-dataset
chrcols <- names(Filter(is.character, df)) ## 46 vars

## character variables as declared in the DD (data type starting with VARCHAR/CHAR)
dd_is_chr <- grepl("^(varchar|char)", dfDD$`Data Type`, ignore.case = TRUE)
chrColsfromDD <- dfDD[dd_is_chr, c("VarNames", "Data Type")]

## data type inconsistency between data and DD:
## mismatchChrs_1: character in the data but another type in the DD
## mismatchChrs_2: character in the DD but another type in the data
mismatchChrs_1 <- setdiff(chrcols, chrColsfromDD[["VarNames"]]) ## character(0)
mismatchChrs_2 <- setdiff(chrColsfromDD[["VarNames"]], chrcols) ## character(0)

## character variables for which the DD specifies valid responses
dd_has_valid_responses <- !is.na(dfDD$`Valid Responses`)
tmp <- dfDD[
  grepl("CHAR|VARCHAR", dfDD$`Data Type`, ignore.case = TRUE) & dd_has_valid_responses,
  c("VarNames", "Valid Responses")
]

## compare the observed values of those columns against the DD specification
DT::datatable(check_valid_responses(tmp, df))
## All values are within valid ranges.


Handling Numeric Variables

## extract numeric variables from sub-dataset
numcols <- colnames(df)[sapply(df, is.numeric)] ## 90 vars

## extract numeric variables from DD

## check data type inconsistency:
## mismatchNums_1: present as numeric in data but others in the DD
## mismatchNums_2: present as numeric in DD but others in the data
numColsfromDD <- dfDD[grepl("number", dfDD$`Data Type`, ignore.case = TRUE),c("VarNames","Valid Responses")]

mismatchNums_1 <- setdiff(numcols,numColsfromDD$VarNames) ## character(0)
mismatchNums_2 <- setdiff(numColsfromDD$VarNames,numcols) ## character(0)

unique(numColsfromDD$`Valid Responses`)
## [1] NA               "1 thru 99999;"  "1 thru 9999;"   "1;\r\n2;\r\n3;"
## [5] "1;\r\n0;"       "1;\r\n0;\r\n"   "1;"             "0;"
tmp <- dfDD[grepl("number", dfDD$`Data Type`, ignore.case = TRUE) & !is.na(dfDD$`Valid Responses`),c("VarNames","Valid Responses")]

DT::datatable(check_valid_numeric_responses(tmp,df))
## ignore GP


Save Cleaned Data

MOCA_RC <- df



NUMBER_SPAN_RC

df <- NUMBER_SPAN_RC

info(NUMBER_SPAN_RC,"SYSIND")
## #obs:527, cols:85, inds:522
## extract all the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
## 
## [[2]]
## [1] "character"
## 
## [[3]]
## [1] "logical"
## 
## [[4]]
## [1] "POSIXct" "POSIXt"
str(df, max.level = 99, list.len = 99999)
Click for details
## 'data.frame':    527 obs. of  85 variables:
##  $ SYSXM                : num  8276493 8258843 8258873 8260113 8277623 ...
##  $ SYSIND               : num  11369703 11369813 11037673 11620563 11435853 ...
##  $ SYSGP                : num  7951913 7952013 7894423 8005633 7962813 ...
##  $ SYSGPSTUDY           : num  1397023 1397123 1309743 1452343 1407923 ...
##  $ SYSINDGP             : num  8138973 8139083 7793413 8389633 8205123 ...
##  $ CGI_ORDER            : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ GPS_ORDER            : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ STDCGI_ORDER         : num  11 11 11 11 11 11 11 11 11 11 ...
##  $ LSTUDY               : chr  "ADCRLPRADI" "ADCRLPRADI" "ADFAMPRADI" "ADCONTROL" ...
##  $ DB_OWNER             : chr  "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" ...
##  $ STUDY                : chr  "ALZ" "ALZ" "ALZ" "ALZ" ...
##  $ SUBSTUDY             : chr  "ADCRLPRADI" "ADCRLPRADI" "ADFAMPRADI" "ADCONTROL" ...
##  $ CENTER               : chr  "IHG" "IHG" "IHG" "IHG" ...
##  $ GP                   : num  88299 88301 87650 104477 88452 ...
##  $ IND                  : num  1 1 9000 1 1 1 105 1 1 1 ...
##  $ REFCTR               : logi  NA NA NA NA NA NA ...
##  $ EXAM_DATE            : POSIXct, format: "2024-02-13" "2024-02-13" ...
##  $ EXAMINER             : chr  "gsv32" "jjs2031" "gsv32" "jjs2031" ...
##  $ DATE_OF_BIRTH        : POSIXct, format: "1944-09-22" "1947-05-13" ...
##  $ AGE_AT_EXAM          : num  79 76 68 73 81 86 71 73 81 79 ...
##  $ REVIEW_DATE          : logi  NA NA NA NA NA NA ...
##  $ REVIEWER             : logi  NA NA NA NA NA NA ...
##  $ SPF3_R1              : chr  "184" NA NA "184" ...
##  $ SPF3_1               : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ SPF3_R2              : chr  "279" NA NA "279" ...
##  $ SPF3_2               : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ SPF4_R1              : chr  "4162" NA NA "4162" ...
##  $ SPF4_1               : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ SPF4_R2              : chr  "8195" NA NA "8195" ...
##  $ SPF4_2               : num  1 1 1 1 0 1 0 1 1 1 ...
##  $ SPF5_R1              : chr  "64928" NA NA "64928" ...
##  $ SPF5_1               : num  1 1 1 1 0 1 0 0 1 0 ...
##  $ SPF5_R2              : chr  "73861" NA NA "73861" ...
##  $ SPF5_2               : num  1 1 1 1 0 1 0 1 0 1 ...
##  $ SPF6_R1              : chr  "392475" "39245" NA "392475" ...
##  $ SPF6_1               : num  1 0 1 1 NA 0 NA 0 0 0 ...
##  $ SPF6_R2              : chr  "628319" "628399" NA "628319" ...
##  $ SPF6_2               : num  1 0 1 1 NA 0 NA 0 0 0 ...
##  $ SPF7_R1              : chr  "9687156" NA NA "9647153" ...
##  $ SPF7_1               : num  0 NA 0 1 NA NA NA NA NA NA ...
##  $ SPF7_R2              : chr  "749281" NA NA "7492681" ...
##  $ SPF7_2               : num  0 NA 0 1 NA NA NA NA NA NA ...
##  $ SPF8_R1              : chr  NA NA NA "47528169" ...
##  $ SPF8_1               : num  NA NA NA 0 NA NA NA NA NA NA ...
##  $ SPF8_R2              : chr  NA NA NA "29753618" ...
##  $ SPF8_2               : num  NA NA NA 0 NA NA NA NA NA NA ...
##  $ SPF9_R1              : chr  NA NA NA NA ...
##  $ SPF9_1               : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ SPF9_R2              : chr  NA NA NA NA ...
##  $ SPF9_2               : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ SPF_LONGEST          : num  6 5 6 10 4 5 4 4 5 4 ...
##  $ SPB2_R1              : chr  "52" NA NA "52" ...
##  $ SPB2_1               : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ SPB2_R2              : chr  "74" NA NA "74" ...
##  $ SPB2_2               : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ SPB3_R1              : chr  "926" NA NA "692" ...
##  $ SPB3_1               : num  0 1 1 1 0 1 0 1 0 0 ...
##  $ SPB3_R2              : chr  "473" NA NA "473" ...
##  $ SPB3_2               : num  1 1 1 1 1 1 0 0 0 0 ...
##  $ SPB4_R1              : chr  "6761" NA NA "68176" ...
##  $ SPB4_1               : num  0 1 0 0 0 0 NA 0 NA NA ...
##  $ SPB4_R2              : chr  "1536" "351" NA "6315" ...
##  $ SPB4_2               : num  0 0 0 0 0 0 NA 0 NA NA ...
##  $ SPB5_R1              : chr  NA "9162" NA NA ...
##  $ SPB5_1               : num  NA 0 NA NA NA NA NA NA NA NA ...
##  $ SPB5_R2              : chr  NA "61927" NA NA ...
##  $ SPB5_2               : num  NA 0 NA NA NA NA NA NA NA NA ...
##  $ SPB6_R1              : chr  NA NA NA NA ...
##  $ SPB6_1               : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ SPB6_R2              : chr  NA NA NA NA ...
##  $ SPB6_2               : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ SPB7_R1              : chr  NA NA NA NA ...
##  $ SPB7_1               : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ SPB7_R2              : chr  NA NA NA NA ...
##  $ SPB7_2               : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ SPB8_R1              : chr  NA NA NA NA ...
##  $ SPB8_1               : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ SPB8_R2              : chr  NA NA NA NA ...
##  $ SPB8_2               : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ SPB_LONGEST          : num  3 4 3 4 3 3 2 3 2 2 ...
##  $ COMMENTS_SPF_SPB     : chr  NA NA NA NA ...
##  $ SPF_TOTALSCORE       : num  8 6 8 10 3 6 3 5 5 5 ...
##  $ SPF_TOTALSCORE_STATUS: chr  "partial" "partial" "partial" "partial" ...
##  $ SPB_TOTALSCORE       : num  3 5 4 4 3 4 2 3 2 2 ...
##  $ SPB_TOTALSCORE_STATUS: chr  "partial" "partial" "partial" "partial" ...


Pull the regenerated DD

dfDD <- read_excel(revisedDDpath, sheet = "NUMBER_SPAN_RC")


Handling Logical Variables

## names of the columns currently stored as logical in the sub-dataset
logicols <- names(df)[unlist(lapply(df, is.logical))]

## show name and declared data type of those variables in the regenerated DD
dfDD[is.element(dfDD$VarNames, logicols), c("VarNames", "Data Type")]
## # A tibble: 3 × 2
##   VarNames    `Data Type`
##   <chr>       <chr>      
## 1 REFCTR      VARCHAR2(6)
## 2 REVIEW_DATE date       
## 3 REVIEWER    CHAR
## select the vars to be converted to date
convert2date <- dfDD$VarNames[dfDD$VarNames %in% logicols & grepl("date", dfDD$`Data Type`, ignore.case = TRUE)] ## REVIEW_DATE

## select the vars to be converted to numeric
## recomputed from this dataset's DD so that a `convert2num` left over from a
## previously processed dataset cannot leak into the setdiff() below
convert2num <- dfDD$VarNames[dfDD$VarNames %in% logicols & grepl("number", dfDD$`Data Type`, ignore.case = TRUE)]

## the rest should be converted to character
convert2chr <- setdiff(logicols, c(convert2num, convert2date)) ## [1] "REFCTR"   "REVIEWER"

## convert (the numeric step is skipped when no logical column is typed as number in the DD)
if (length(convert2num) > 0) {
  df[convert2num] <- lapply(df[convert2num], as.numeric)
}
df[convert2date] <- lapply(df[convert2date], as.Date)
df[convert2chr] <- lapply(df[convert2chr], as.character)

## recheck the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
## 
## [[2]]
## [1] "character"
## 
## [[3]]
## [1] "POSIXct" "POSIXt" 
## 
## [[4]]
## [1] "Date"


Handling Date Variables

## extract date variables from sub-dataset
datecols <- colnames(df)[sapply(df, function(x) inherits(x, c("POSIXct", "POSIXt")))]
## [1] "EXAM_DATE"     "DATE_OF_BIRTH"

## extract date variables from regenerated DD
## (case-insensitive exact match, so "DATE", "date" and "Date" are all picked up,
## consistent with the other sub-dataset sections)
datecolsFromDD <- dfDD$VarNames[grepl("^date$", dfDD$`Data Type`, ignore.case = TRUE)]

## compare the two to see if we missing any date variables
setdiff(datecols,datecolsFromDD) ## character(0)
## character(0)
setdiff(datecolsFromDD,datecols) ## [1] "REVIEW_DATE" can ignore REVIEW_DATE, as it has been corrected in previous step
## [1] "REVIEW_DATE"
head(df[,datecols])
##    EXAM_DATE DATE_OF_BIRTH
## 1 2024-02-13    1944-09-22
## 2 2024-02-13    1947-05-13
## 3 2023-10-24    1954-10-29
## 4 2023-05-15    1949-12-01
## 5 2024-02-15    1942-09-30
## 6 2023-05-09    1936-05-22
## convert format
df[datecols] <- lapply(df[datecols], as.Date)

## recheck the unique data types
unique(sapply(df, class))
## [1] "numeric"   "character" "Date"


Handling Character Variables

## extract characteristic variables from sub-dataset
chrcols <- colnames(df)[sapply(df, is.character)] ## 39 vars

## check data type inconsistency:
## mismatchChrs_1: present as chr in data but others in the DD
## mismatchChrs_2: present as chr in DD but others in the data
chrColsfromDD <- dfDD[grepl("^(varchar|char)", dfDD$`Data Type`, ignore.case = TRUE),c("VarNames","Data Type")]

mismatchChrs_1 <- setdiff(chrcols,chrColsfromDD$VarNames) ## character(0)
mismatchChrs_2 <- setdiff(chrColsfromDD$VarNames,chrcols) ## character(0)

## extract characteristic variables with value specification
tmp <- dfDD[grepl("CHAR|VARCHAR", dfDD$`Data Type`, ignore.case = TRUE) & !is.na(dfDD$`Valid Responses`),c("VarNames","Valid Responses")]

## check if the unique values for the chr columns in the dataset matching with the DD
DT::datatable(check_valid_responses(tmp,df))
## All values are within valid ranges.


Handling Numeric Variables

## numeric variables as they are stored in the sub-dataset
numcols <- names(Filter(is.numeric, df)) ## 43 vars

## numeric variables as declared in the DD (data type containing "number")
dd_is_num <- grepl("number", dfDD$`Data Type`, ignore.case = TRUE)
numColsfromDD <- dfDD[dd_is_num, c("VarNames", "Valid Responses")]

## data type inconsistency between data and DD:
## mismatchNums_1: numeric in the data but another type in the DD
## mismatchNums_2: numeric in the DD but another type in the data
mismatchNums_1 <- setdiff(numcols, numColsfromDD[["VarNames"]]) ## character(0)
mismatchNums_2 <- setdiff(numColsfromDD[["VarNames"]], numcols) ## character(0)

unique(numColsfromDD$`Valid Responses`)
## [1] NA              "1 thru 99999;" "1 thru 9999;"  "1;\r\n0;"
tmp <- dfDD[grepl("number", dfDD$`Data Type`, ignore.case = TRUE) & !is.na(dfDD$`Valid Responses`),c("VarNames","Valid Responses")]

DT::datatable(check_valid_numeric_responses(tmp,df))
## ignore GP


Save Cleaned Data

NUMBER_SPAN_RC <- df



Duplicates Check

Duplicates Detection

## Collect the names of the datasets that contain duplicated records
longDfwithDuplicates <- NULL
otherwithDuplicates <- NULL
index <- 0

## For every dataset listed in df_names the loop below:
## 1. reports datasets with exactly one row per SYSIND (cross-sectional, no duplicates)
## 2. for datasets with SYSIND and EXAM_DATE: reports them as clean, or stores the
##    name in longDfwithDuplicates when a SYSIND/EXAM_DATE pair occurs more than once
## 3. stores the name of every remaining dataset (no EXAM_DATE) in otherwithDuplicates
for (df_name in df_names) {
  df <- get(df_name)

  ## one row per individual: nothing to check
  if (nrow(df) == length(unique(df[["SYSIND"]]))) {
    index <- index + 1
    cat(index,
        "No duplicates found in cross-sectional dataset: ",
        df_name,
        "\n")
    next
  }

  ## repeated individuals but no EXAM_DATE column: handled separately later
  if (!all(c("SYSIND", "EXAM_DATE") %in% colnames(df))) {
    otherwithDuplicates <- c(otherwithDuplicates, df_name)
    next
  }

  ## rows whose SYSIND/EXAM_DATE pair occurs more than once
  dup_rows <- dplyr::ungroup(
    dplyr::filter(dplyr::group_by(df, SYSIND, EXAM_DATE), n() > 1)
  )

  if (nrow(dup_rows) > 0) {
    longDfwithDuplicates <- c(longDfwithDuplicates, df_name)
    next
  }

  index <- index + 1
  cat(index,
      "No duplicates found in longitudinal dataset: ",
      df_name,
      "\n")
}
## 1 No duplicates found in longitudinal dataset:  AAAD_GERIAT 
## 2 No duplicates found in longitudinal dataset:  AAAD_MEDCON 
## 3 No duplicates found in longitudinal dataset:  AAAD_SOCIO_DEMO 
## 4 No duplicates found in longitudinal dataset:  AAAD_TRAILS 
## 5 No duplicates found in longitudinal dataset:  ALZ_B9_JUDGE_RC 
## 6 No duplicates found in longitudinal dataset:  ALZ_CSDD 
## 7 No duplicates found in cross-sectional dataset:  ALZ_GAI_SP 
## 8 No duplicates found in longitudinal dataset:  ALZ_NEURO_CDR 
## 9 No duplicates found in cross-sectional dataset:  ALZ_RPFQ 
## 10 No duplicates found in longitudinal dataset:  ALZ_SCREENING_RC 
## 11 No duplicates found in longitudinal dataset:  ALZ_STICK_D_RC 
## 12 No duplicates found in longitudinal dataset:  B4_CDR_RC 
## 13 No duplicates found in longitudinal dataset:  B5_NPIQ_RC 
## 14 No duplicates found in longitudinal dataset:  B6_GDS_RC 
## 15 No duplicates found in longitudinal dataset:  B7_FAS_RC 
## 16 No duplicates found in cross-sectional dataset:  BCF_RECOG_RC 
## 17 No duplicates found in cross-sectional dataset:  BCFCD_RC 
## 18 No duplicates found in cross-sectional dataset:  BCFCI_RC 
## 19 No duplicates found in cross-sectional dataset:  BILINGUAL_SCALE_RC 
## 20 No duplicates found in longitudinal dataset:  CAT_FLUENCY_RC 
## 21 No duplicates found in cross-sectional dataset:  CERAD_DEL_RC 
## 22 No duplicates found in cross-sectional dataset:  CERAD_IMM_RC 
## 23 No duplicates found in cross-sectional dataset:  CERAD_RECOG_RC 
## 24 No duplicates found in longitudinal dataset:  CRAFT_21_DEL_RC 
## 25 No duplicates found in longitudinal dataset:  CRAFT_21_IMM_RC 
## 26 No duplicates found in longitudinal dataset:  MEDCON_RC 
## 27 No duplicates found in longitudinal dataset:  MEDICAL_HIST 
## 28 No duplicates found in cross-sectional dataset:  MINT_RC 
## 29 No duplicates found in longitudinal dataset:  MINT_SP_RC 
## 30 No duplicates found in longitudinal dataset:  MOCA_RC 
## 31 No duplicates found in longitudinal dataset:  NUMBER_SPAN_RC


## longitudinal datasets with duplicates
longDfwithDuplicates
## [1] "ALZ_NPIQ_CBRS"
## otherwithDuplicates
## the following datasets do not have EXAM_DATE but have other date variables,
## so duplicates are checked for them one by one, using each dataset's own date variable
otherwithDuplicates
## [1] "ALZ_CLINICALSUM" "ALZ_EXAM"        "ALZ_LOAD_COG"    "ALZ_NCRAD"      
## [5] "ALZ_SCREENING"   "CONSENSUS_DX"
## check duplicates for ALZ_CLINICALSUM
ALZ_CLINICALSUM %>%
  dplyr::group_by(SYSIND, FORM_DATE) %>%
  dplyr::filter(n() > 1) %>%
  dplyr::ungroup() %>%
  nrow() %>% print() ## 0 row
## [1] 0
## check duplicates for ALZ_EXAM
ALZ_EXAM %>%
  dplyr::group_by(SYSIND, FORM_DATE) %>%
  dplyr::filter(n() > 1) %>%
  dplyr::ungroup() %>%
  nrow() %>% print() ## 0 row
## [1] 0
## check duplicates for ALZ_LOAD_COG
ALZ_LOAD_COG %>%
  dplyr::group_by(SYSIND, INTERVIEW_DATE) %>%
  dplyr::filter(n() > 1) %>%
  dplyr::ungroup() %>%
  nrow() %>% print() ## 0 row
## [1] 0
## check duplicates for ALZ_NCRAD
ALZ_NCRAD %>%
  dplyr::group_by(SYSIND, FORM_DATE) %>%
  dplyr::filter(n() > 1) %>%
  dplyr::ungroup() %>%
  nrow() %>% print() ## 2 rows
## [1] 2
## ALZ_SCREENING
ALZ_SCREENING %>%
  dplyr::group_by(SYSIND, FORM_DATE) %>%
  dplyr::filter(n() > 1) %>%
  dplyr::ungroup() %>%
  nrow() %>% print() ## 0 row
## [1] 0
## CONSENSUS_DX
CONSENSUS_DX %>%
  dplyr::group_by(SYSIND, DATE_DX) %>%
  dplyr::filter(n() > 1) %>%
  dplyr::ungroup() %>%
  nrow() %>% print() ## 211 rows
## [1] 211



Duplicates Handling Per Dataset

ALZ_NPIQ_CBRS

cat("Before duplicates handling - SYSIND*EXAM_DATE is: ",dupFixCheck(ALZ_NPIQ_CBRS,"SYSIND","EXAM_DATE"),"\n")
## Before duplicates handling - SYSIND*EXAM_DATE is:  122
info(ALZ_NPIQ_CBRS,"SYSIND")
## #obs:123, cols:116, inds:121
## view the duplicates
ALZ_NPIQ_CBRS %>%
  dplyr::group_by(SYSIND, EXAM_DATE) %>%
  dplyr::filter(n() > 1) %>%
  dplyr::ungroup()
## # A tibble: 2 × 116
##     SYSXM   SYSIND   SYSGP SYSGPSTUDY SYSINDGP CGI_ORDER GPS_ORDER STDCGI_ORDER
##     <dbl>    <dbl>   <dbl>      <dbl>    <dbl>     <dbl>     <dbl>        <dbl>
## 1 7540713 11048883 7896183    1311503  7804743         1         1           11
## 2 7540723 11048883 7896183    1311503  7804743         1         1           11
## # ℹ 108 more variables: LSTUDY <chr>, DB_OWNER <chr>, STUDY <chr>,
## #   SUBSTUDY <chr>, CENTER <chr>, GP <dbl>, IND <dbl>, REFCTR <chr>,
## #   EXAM_DATE <date>, EXAMINER <chr>, DATE_OF_BIRTH <date>, AGE_AT_EXAM <dbl>,
## #   NPIQINF <chr>, NPIQINF_PRO <chr>, NPIQINF_OTH <chr>, NPIQINFA <dbl>,
## #   NPIQINFB <dbl>, NPIQTYPE <dbl>, AGIT <dbl>, AGITSEV <dbl>,
## #   AGITATION_DIST <dbl>, DEPD <dbl>, DEPDSEV <dbl>, DEPRESS_DIST <dbl>,
## #   ANX <dbl>, ANXSEV <dbl>, ANXIETY_DIST <dbl>, ELAT <dbl>, ELATSEV <dbl>, …
## after checking the duplicates, I decided to keep the second obs as it has fewer missing values
## SYSXM is numeric, so it is matched against a number: comparing with the string
## "7540713" would rely on R's number-to-string coercion. %in% also never returns NA,
## so a missing SYSXM cannot produce all-NA rows in the subset.
ALZ_NPIQ_CBRS <- ALZ_NPIQ_CBRS[!(ALZ_NPIQ_CBRS$SYSXM %in% 7540713), ]
info(ALZ_NPIQ_CBRS,"SYSIND")
## #obs:122, cols:116, inds:121
cat("After duplicates handling - SYSIND*EXAM_DATE is: ",dupFixCheck(ALZ_NPIQ_CBRS,"SYSIND","EXAM_DATE"),"\n")
## After duplicates handling - SYSIND*EXAM_DATE is:  122



ALZ_NCRAD

cat("Before duplicates handling - SYSIND*FORM_DATE is: ",dupFixCheck(ALZ_NCRAD,"SYSIND","FORM_DATE"),"\n")
## Before duplicates handling - SYSIND*FORM_DATE is:  742
info(ALZ_NCRAD,"SYSIND")
## #obs:743, cols:53, inds:742
## view the duplicates
ALZ_NCRAD %>%
  dplyr::group_by(SYSIND, FORM_DATE) %>%
  dplyr::filter(n() > 1) %>%
  dplyr::ungroup()
## # A tibble: 2 × 53
##     SYSXM   SYSIND   SYSGP SYSGPSTUDY SYSINDGP CGI_ORDER GPS_ORDER STDCGI_ORDER
##     <dbl>    <dbl>   <dbl>      <dbl>    <dbl>     <dbl>     <dbl>        <dbl>
## 1 7906253 11009263 7889133    1304473  7764083         1         1           11
## 2 8388533 11009263 7889133    1304473  7764083         1         1           11
## # ℹ 45 more variables: LSTUDY <chr>, DB_OWNER <chr>, STUDY <chr>,
## #   SUBSTUDY <chr>, CENTER <chr>, GP <dbl>, IND <dbl>, REFCTR <chr>,
## #   QUALIFY <chr>, FORM_DATE <date>, FILLED_OUT_BY <chr>, DATE_OF_BIRTH <date>,
## #   IN_NCRAD <chr>, SAMPLED <dbl>, EDUC <dbl>, VISIT <dbl>, COMREQ <dbl>,
## #   NOTDEMCI <dbl>, EVALMETH <dbl>, EVALYR <dbl>, CLDEMLEW <dbl>,
## #   COMDXAD <chr>, NONADDEM <dbl>, COMDXNAD <chr>, AAOSYMP <dbl>,
## #   STROKETY <dbl>, STROKEAGE <dbl>, HYPERAGE <dbl>, HEARTAGE <dbl>, …
## the duplicates are exactly the same, so either one can be dropped; I drop the first observation
## SYSXM is numeric, so it is matched against a number: comparing with the string
## "7906253" would rely on R's number-to-string coercion. %in% also never returns NA,
## so a missing SYSXM cannot produce all-NA rows in the subset.
ALZ_NCRAD <- ALZ_NCRAD[!(ALZ_NCRAD$SYSXM %in% 7906253), ]
info(ALZ_NCRAD,"SYSIND")
## #obs:742, cols:53, inds:742
cat("After duplicates handling - SYSIND*FORM_DATE is: ",dupFixCheck(ALZ_NCRAD,"SYSIND","FORM_DATE"),"\n")
## After duplicates handling - SYSIND*FORM_DATE is:  742



CONSENSUS_DX

cat("Before duplicates handling - SYSIND*DATE_DX is: ",dupFixCheck(CONSENSUS_DX,"SYSIND","DATE_DX"),"\n")
## Before duplicates handling - SYSIND*DATE_DX is:  1700
info(CONSENSUS_DX,"SYSIND")
## #obs:1807, cols:43, inds:1584
## view the duplicates
dups_CONSENSUS_DX <- CONSENSUS_DX %>%
  dplyr::group_by(SYSIND, DATE_DX) %>%
  dplyr::filter(n() > 1) %>%
  dplyr::ungroup()

info(dups_CONSENSUS_DX,"SYSIND") 
## #obs:211, cols:43, inds:104
## some individuals have multiple CDX for the same DATE_DX, and the RANK variable
## records the number of visits (NOTE(review): confirm what RANK actually encodes),
## so I decided to use the pivot_wider function to keep all the CDX values
## instead of dropping the duplicated rows

## IDcols: the columns used as id_cols in the pivot_wider() call that follows.
## NOTE(review): the selection relies on column positions (1:16 and 27:43) plus two
## names; it silently picks different columns if the column order of CONSENSUS_DX
## changes -- check the printed IDcols whenever the raw file is refreshed.
IDcols <- c(names(CONSENSUS_DX)[1:16],"DATE_OF_BIRTH","DATE_DX",names(CONSENSUS_DX)[27:43])
IDcols
##  [1] "SYSXM"             "SYSIND"            "SYSGP"            
##  [4] "SYSGPSTUDY"        "SYSINDGP"          "CGI_ORDER"        
##  [7] "GPS_ORDER"         "STDCGI_ORDER"      "LSTUDY"           
## [10] "DB_OWNER"          "STUDY"             "SUBSTUDY"         
## [13] "CENTER"            "GP"                "IND"              
## [16] "REFCTR"            "DATE_OF_BIRTH"     "DATE_DX"          
## [19] "CLINICAL_COMMENTS" "OTHER_TXT1"        "OTHER_TXT2"       
## [22] "OTHER_TXT3"        "CALC_VAL1"         "CALC_VAL2"        
## [25] "CALC_VAL3"         "CALC_VAL4"         "CALC_VAL5"        
## [28] "CALC_VAL6"         "CALC_VAL7"         "CALC_VAL8"        
## [31] "CALC_VAL9"         "CALC_VAL10"        "CALC_VAL11"       
## [34] "LAST_SOURCE"       "OTHER_DATE1"
## Reshape CONSENSUS_DX from long to wide: rows sharing the same values in all IDcols
## are collapsed into one row, and each value column is spread into one column per
## RANK value, named <column>_<RANK>.
## NOTE(review): RANK is used both as names_from and inside values_from (RANK:WHO_DX),
## so the result presumably also carries RANK_<k> columns -- confirm they are wanted.
## NOTE(review): the output recorded after this step shows 1701 rows but 1700
## SYSIND*DATE_DX combinations, which suggests one SYSIND/DATE_DX pair still differs in
## some IDcols value and was not collapsed -- confirm.
CONSENSUS_DX <- CONSENSUS_DX %>%
  pivot_wider(
    id_cols = all_of(IDcols),
    names_from = RANK,
    values_from = c(REVIEW_DATE, REVIEWER, RANK:WHO_DX,COMMENTS),
    names_sep = "_"
  )
info(CONSENSUS_DX,"SYSIND")
## #obs:1701, cols:59, inds:1584
cat("After duplicates handling - SYSIND*DATE_DX is: ",dupFixCheck(CONSENSUS_DX,"SYSIND","DATE_DX"),"\n")
## After duplicates handling - SYSIND*DATE_DX is:  1700



Individuals Count

Total Number of Individuals

## pool the SYSIND values of every sub-dataset and keep each individual once
all_ids <- unique(unlist(
  lapply(df_names, function(dataset_name) get(dataset_name)[["SYSIND"]])
))

## total number of unique individuals across all sub-datasets
length(all_ids) ## 1994 individuals
## [1] 1994


Individual by Study Presence Check

# One column per dataset, one row per individual: a green check mark when the
# individual's SYSIND occurs in that dataset, a red cross otherwise (HTML cells)
presence_mat <- sapply(df_names, function(dfn) {
  ids_in_dataset <- get(dfn)$SYSIND
  # position 1 = absent, position 2 = present
  marks <- c(
    '<span style="color:darkred; font-weight:bold;">✘</span>',
    '<span style="color:darkgreen; font-weight:bold;">✔</span>'
  )
  marks[(all_ids %in% ids_in_dataset) + 1L]
})

# Put the IDs in front of the presence columns
presence_df <- data.frame(ID = all_ids, presence_mat, check.names = FALSE)

# 0-based indices of the columns to center: every column except ID (index 0)
center_targets <- seq_len(ncol(presence_df) - 1L)

# escape = FALSE so the HTML marks are rendered instead of shown as text
DT::datatable(
  presence_df,
  escape = FALSE,
  rownames = FALSE,
  options = list(
    pageLength = 50,
    scrollX = TRUE,
    columnDefs = list(
      list(className = "dt-center", targets = center_targets)
    )
  )
)


Individuals Missingness Per Study

## one summary row per dataset: number of distinct individuals, number of
## observations, and number of individuals (out of all_ids) absent from the dataset
summary_df <- do.call(rbind, lapply(df_names, function(dataset_name) {
  dataset <- get(dataset_name)
  ids_present <- unique(dataset$SYSIND)
  ids_absent <- setdiff(all_ids, ids_present)

  data.frame(
    dataset = dataset_name,
    n_individuals = length(ids_present),
    n_obs = nrow(dataset),
    n_individials_missing = length(ids_absent),
    stringsAsFactors = FALSE
  )
}))

DT::datatable(summary_df)




Merge Sub-Datasets

Grouping Datasets

## split the dataset names into three groups:
## cross_dfs            - one row per SYSIND
## long_dfs_wEXAM_DATE  - repeated SYSIND, EXAM_DATE column available
## long_dfs_woEXAM_DATE - repeated SYSIND, no EXAM_DATE column
cross_dfs <- NULL
long_dfs_wEXAM_DATE <- NULL
long_dfs_woEXAM_DATE <- NULL

for (df_name in df_names) {
  df_obj <- get(df_name)

  if (length(unique(df_obj[["SYSIND"]])) == nrow(df_obj)) {
    cross_dfs <- c(cross_dfs, df_name)
    next
  }

  if (is.element("EXAM_DATE", names(df_obj))) {
    long_dfs_wEXAM_DATE <- c(long_dfs_wEXAM_DATE, df_name)
    next
  }

  long_dfs_woEXAM_DATE <- c(long_dfs_woEXAM_DATE, df_name)
}

cross_dfs
##  [1] "ALZ_GAI_SP"         "ALZ_NCRAD"          "ALZ_RPFQ"          
##  [4] "BCF_RECOG_RC"       "BCFCD_RC"           "BCFCI_RC"          
##  [7] "BILINGUAL_SCALE_RC" "CERAD_DEL_RC"       "CERAD_IMM_RC"      
## [10] "CERAD_RECOG_RC"     "MINT_RC"
long_dfs_wEXAM_DATE
##  [1] "AAAD_GERIAT"      "AAAD_MEDCON"      "AAAD_SOCIO_DEMO"  "AAAD_TRAILS"     
##  [5] "ALZ_B9_JUDGE_RC"  "ALZ_CSDD"         "ALZ_NEURO_CDR"    "ALZ_NPIQ_CBRS"   
##  [9] "ALZ_SCREENING_RC" "ALZ_STICK_D_RC"   "B4_CDR_RC"        "B5_NPIQ_RC"      
## [13] "B6_GDS_RC"        "B7_FAS_RC"        "CAT_FLUENCY_RC"   "CRAFT_21_DEL_RC" 
## [17] "CRAFT_21_IMM_RC"  "MEDCON_RC"        "MEDICAL_HIST"     "MINT_SP_RC"      
## [21] "MOCA_RC"          "NUMBER_SPAN_RC"
long_dfs_woEXAM_DATE
## [1] "ALZ_CLINICALSUM" "ALZ_EXAM"        "ALZ_LOAD_COG"    "ALZ_SCREENING"  
## [5] "CONSENSUS_DX"
# Split the datasets named in df_names by whether they have an EXAM_DATE
# column:
#   dfwEXAM_DATE  - datasets with an EXAM_DATE column
#   dfwoEXAM_DATE - datasets without one
#
# The flag is computed once per dataset with vapply() and both groups are
# taken by logical subsetting, rather than growing two vectors with c()
# inside a loop. The order within each group follows df_names.
has_exam_date <- vapply(
  df_names,
  function(dataset_name) "EXAM_DATE" %in% names(get(dataset_name)),
  logical(1),
  USE.NAMES = FALSE
)

dfwEXAM_DATE <- df_names[has_exam_date]
dfwoEXAM_DATE <- df_names[!has_exam_date]

print(dfwEXAM_DATE)
##  [1] "AAAD_GERIAT"        "AAAD_MEDCON"        "AAAD_SOCIO_DEMO"   
##  [4] "AAAD_TRAILS"        "ALZ_B9_JUDGE_RC"    "ALZ_CSDD"          
##  [7] "ALZ_GAI_SP"         "ALZ_NEURO_CDR"      "ALZ_NPIQ_CBRS"     
## [10] "ALZ_RPFQ"           "ALZ_SCREENING_RC"   "ALZ_STICK_D_RC"    
## [13] "B4_CDR_RC"          "B5_NPIQ_RC"         "B6_GDS_RC"         
## [16] "B7_FAS_RC"          "BCF_RECOG_RC"       "BCFCD_RC"          
## [19] "BCFCI_RC"           "BILINGUAL_SCALE_RC" "CAT_FLUENCY_RC"    
## [22] "CERAD_DEL_RC"       "CERAD_IMM_RC"       "CERAD_RECOG_RC"    
## [25] "CRAFT_21_DEL_RC"    "CRAFT_21_IMM_RC"    "MEDCON_RC"         
## [28] "MEDICAL_HIST"       "MINT_RC"            "MINT_SP_RC"        
## [31] "MOCA_RC"            "NUMBER_SPAN_RC"
print(dfwoEXAM_DATE)
## [1] "ALZ_CLINICALSUM" "ALZ_EXAM"        "ALZ_LOAD_COG"    "ALZ_NCRAD"      
## [5] "ALZ_SCREENING"   "CONSENSUS_DX"



AGE_AT_EXAM Variable Explore

## For every dataset in df_names, print one numbered line stating which of
## the columns AGE_AT_EXAM and EXAM_DATE it contains.
index <- 1

for (df_name in df_names) {
  df_obj <- get(df_name, inherits = TRUE)
  column_names <- names(df_obj)

  has_age <- is.element("AGE_AT_EXAM", column_names)
  has_date <- is.element("EXAM_DATE", column_names)

  ## exactly one of the four cases applies
  status <- if (!has_age && !has_date) {
    ": ============ none of them present ============\n"
  } else if (!has_date) {
    ": only AGE_AT_EXAM present\n"
  } else if (!has_age) {
    ": only EXAM_DATE present\n"
  } else {
    ": both EXAM_DATE and AGE_AT_EXAM present\n"
  }
  cat(index, ":", df_name, status)

  index <- index + 1
}
## 1 : AAAD_GERIAT : both EXAM_DATE and AGE_AT_EXAM present
## 2 : AAAD_MEDCON : both EXAM_DATE and AGE_AT_EXAM present
## 3 : AAAD_SOCIO_DEMO : both EXAM_DATE and AGE_AT_EXAM present
## 4 : AAAD_TRAILS : both EXAM_DATE and AGE_AT_EXAM present
## 5 : ALZ_B9_JUDGE_RC : both EXAM_DATE and AGE_AT_EXAM present
## 6 : ALZ_CLINICALSUM : ============ none of them present ============
## 7 : ALZ_CSDD : both EXAM_DATE and AGE_AT_EXAM present
## 8 : ALZ_EXAM : ============ none of them present ============
## 9 : ALZ_GAI_SP : both EXAM_DATE and AGE_AT_EXAM present
## 10 : ALZ_LOAD_COG : ============ none of them present ============
## 11 : ALZ_NCRAD : ============ none of them present ============
## 12 : ALZ_NEURO_CDR : both EXAM_DATE and AGE_AT_EXAM present
## 13 : ALZ_NPIQ_CBRS : both EXAM_DATE and AGE_AT_EXAM present
## 14 : ALZ_RPFQ : both EXAM_DATE and AGE_AT_EXAM present
## 15 : ALZ_SCREENING : ============ none of them present ============
## 16 : ALZ_SCREENING_RC : both EXAM_DATE and AGE_AT_EXAM present
## 17 : ALZ_STICK_D_RC : both EXAM_DATE and AGE_AT_EXAM present
## 18 : B4_CDR_RC : both EXAM_DATE and AGE_AT_EXAM present
## 19 : B5_NPIQ_RC : both EXAM_DATE and AGE_AT_EXAM present
## 20 : B6_GDS_RC : both EXAM_DATE and AGE_AT_EXAM present
## 21 : B7_FAS_RC : both EXAM_DATE and AGE_AT_EXAM present
## 22 : BCF_RECOG_RC : both EXAM_DATE and AGE_AT_EXAM present
## 23 : BCFCD_RC : both EXAM_DATE and AGE_AT_EXAM present
## 24 : BCFCI_RC : both EXAM_DATE and AGE_AT_EXAM present
## 25 : BILINGUAL_SCALE_RC : both EXAM_DATE and AGE_AT_EXAM present
## 26 : CAT_FLUENCY_RC : both EXAM_DATE and AGE_AT_EXAM present
## 27 : CERAD_DEL_RC : both EXAM_DATE and AGE_AT_EXAM present
## 28 : CERAD_IMM_RC : both EXAM_DATE and AGE_AT_EXAM present
## 29 : CERAD_RECOG_RC : both EXAM_DATE and AGE_AT_EXAM present
## 30 : CONSENSUS_DX : ============ none of them present ============
## 31 : CRAFT_21_DEL_RC : both EXAM_DATE and AGE_AT_EXAM present
## 32 : CRAFT_21_IMM_RC : both EXAM_DATE and AGE_AT_EXAM present
## 33 : MEDCON_RC : both EXAM_DATE and AGE_AT_EXAM present
## 34 : MEDICAL_HIST : both EXAM_DATE and AGE_AT_EXAM present
## 35 : MINT_RC : both EXAM_DATE and AGE_AT_EXAM present
## 36 : MINT_SP_RC : both EXAM_DATE and AGE_AT_EXAM present
## 37 : MOCA_RC : both EXAM_DATE and AGE_AT_EXAM present
## 38 : NUMBER_SPAN_RC : both EXAM_DATE and AGE_AT_EXAM present

Findings: the AGE_AT_EXAM and EXAM_DATE variables are paired: if one is present in a dataset, the other is present as well.

## For every dataset that has AGE_AT_EXAM, check whether the pairs
## (SYSIND, AGE_AT_EXAM) and (SYSIND, EXAM_DATE) occur more than once.
## Datasets without AGE_AT_EXAM only get a separator line.
## Note: the EXAM_DATE result is reported only when no AGE_AT_EXAM
## duplicates were found; otherwise the AGE_AT_EXAM message takes precedence.
index <- 1
for (df_name in df_names) {
  df_obj <- get(df_name)

  if (!("AGE_AT_EXAM" %in% names(df_obj))) {
    cat(index, " :", df_name, "===============================================","\n")
  } else {
    cat(index, " :", df_name, " : AGE_AT_EXAM present", "\n")

    ## rows whose (SYSIND, AGE_AT_EXAM) pair appears more than once
    dup_rows_AGE_AT_EXAM <- dplyr::ungroup(
      dplyr::filter(
        dplyr::group_by(df_obj, SYSIND, AGE_AT_EXAM),
        dplyr::n() > 1
      )
    )

    ## rows whose (SYSIND, EXAM_DATE) pair appears more than once
    dup_rows_EXAM_DATE <- dplyr::ungroup(
      dplyr::filter(
        dplyr::group_by(df_obj, SYSIND, EXAM_DATE),
        dplyr::n() > 1
      )
    )

    verdict <- if (nrow(dup_rows_AGE_AT_EXAM) > 0) {
      " : !!!duplicates present (checking by AGE_AT_EXAM )!!!"
    } else if (nrow(dup_rows_EXAM_DATE) > 0) {
      " : !!!duplicates present (checking by EXAM_DATE )!!!"
    } else {
      " : no duplicates found:)"
    }
    cat(index, " :", df_name, verdict, "\n")
  }

  index <- index + 1
}
## 1  : AAAD_GERIAT  : AGE_AT_EXAM present 
## 1  : AAAD_GERIAT  : !!!duplicates present (checking by AGE_AT_EXAM )!!! 
## 2  : AAAD_MEDCON  : AGE_AT_EXAM present 
## 2  : AAAD_MEDCON  : !!!duplicates present (checking by AGE_AT_EXAM )!!! 
## 3  : AAAD_SOCIO_DEMO  : AGE_AT_EXAM present 
## 3  : AAAD_SOCIO_DEMO  : no duplicates found:) 
## 4  : AAAD_TRAILS  : AGE_AT_EXAM present 
## 4  : AAAD_TRAILS  : no duplicates found:) 
## 5  : ALZ_B9_JUDGE_RC  : AGE_AT_EXAM present 
## 5  : ALZ_B9_JUDGE_RC  : no duplicates found:) 
## 6  : ALZ_CLINICALSUM =============================================== 
## 7  : ALZ_CSDD  : AGE_AT_EXAM present 
## 7  : ALZ_CSDD  : !!!duplicates present (checking by AGE_AT_EXAM )!!! 
## 8  : ALZ_EXAM =============================================== 
## 9  : ALZ_GAI_SP  : AGE_AT_EXAM present 
## 9  : ALZ_GAI_SP  : no duplicates found:) 
## 10  : ALZ_LOAD_COG =============================================== 
## 11  : ALZ_NCRAD =============================================== 
## 12  : ALZ_NEURO_CDR  : AGE_AT_EXAM present 
## 12  : ALZ_NEURO_CDR  : !!!duplicates present (checking by AGE_AT_EXAM )!!! 
## 13  : ALZ_NPIQ_CBRS  : AGE_AT_EXAM present 
## 13  : ALZ_NPIQ_CBRS  : no duplicates found:) 
## 14  : ALZ_RPFQ  : AGE_AT_EXAM present 
## 14  : ALZ_RPFQ  : no duplicates found:) 
## 15  : ALZ_SCREENING =============================================== 
## 16  : ALZ_SCREENING_RC  : AGE_AT_EXAM present 
## 16  : ALZ_SCREENING_RC  : no duplicates found:) 
## 17  : ALZ_STICK_D_RC  : AGE_AT_EXAM present 
## 17  : ALZ_STICK_D_RC  : no duplicates found:) 
## 18  : B4_CDR_RC  : AGE_AT_EXAM present 
## 18  : B4_CDR_RC  : no duplicates found:) 
## 19  : B5_NPIQ_RC  : AGE_AT_EXAM present 
## 19  : B5_NPIQ_RC  : no duplicates found:) 
## 20  : B6_GDS_RC  : AGE_AT_EXAM present 
## 20  : B6_GDS_RC  : !!!duplicates present (checking by AGE_AT_EXAM )!!! 
## 21  : B7_FAS_RC  : AGE_AT_EXAM present 
## 21  : B7_FAS_RC  : no duplicates found:) 
## 22  : BCF_RECOG_RC  : AGE_AT_EXAM present 
## 22  : BCF_RECOG_RC  : no duplicates found:) 
## 23  : BCFCD_RC  : AGE_AT_EXAM present 
## 23  : BCFCD_RC  : no duplicates found:) 
## 24  : BCFCI_RC  : AGE_AT_EXAM present 
## 24  : BCFCI_RC  : no duplicates found:) 
## 25  : BILINGUAL_SCALE_RC  : AGE_AT_EXAM present 
## 25  : BILINGUAL_SCALE_RC  : no duplicates found:) 
## 26  : CAT_FLUENCY_RC  : AGE_AT_EXAM present 
## 26  : CAT_FLUENCY_RC  : no duplicates found:) 
## 27  : CERAD_DEL_RC  : AGE_AT_EXAM present 
## 27  : CERAD_DEL_RC  : no duplicates found:) 
## 28  : CERAD_IMM_RC  : AGE_AT_EXAM present 
## 28  : CERAD_IMM_RC  : no duplicates found:) 
## 29  : CERAD_RECOG_RC  : AGE_AT_EXAM present 
## 29  : CERAD_RECOG_RC  : no duplicates found:) 
## 30  : CONSENSUS_DX =============================================== 
## 31  : CRAFT_21_DEL_RC  : AGE_AT_EXAM present 
## 31  : CRAFT_21_DEL_RC  : no duplicates found:) 
## 32  : CRAFT_21_IMM_RC  : AGE_AT_EXAM present 
## 32  : CRAFT_21_IMM_RC  : no duplicates found:) 
## 33  : MEDCON_RC  : AGE_AT_EXAM present 
## 33  : MEDCON_RC  : !!!duplicates present (checking by AGE_AT_EXAM )!!! 
## 34  : MEDICAL_HIST  : AGE_AT_EXAM present 
## 34  : MEDICAL_HIST  : !!!duplicates present (checking by AGE_AT_EXAM )!!! 
## 35  : MINT_RC  : AGE_AT_EXAM present 
## 35  : MINT_RC  : no duplicates found:) 
## 36  : MINT_SP_RC  : AGE_AT_EXAM present 
## 36  : MINT_SP_RC  : no duplicates found:) 
## 37  : MOCA_RC  : AGE_AT_EXAM present 
## 37  : MOCA_RC  : no duplicates found:) 
## 38  : NUMBER_SPAN_RC  : AGE_AT_EXAM present 
## 38  : NUMBER_SPAN_RC  : no duplicates found:)

Findings: when merging, AGE_AT_EXAM cannot be used as the key column because it produces duplicates: for some individuals, observations from different exams share the same AGE_AT_EXAM value after rounding.



Generate Visit Index Variable for Individuals

##########################################################################################
## Collect every distinct (SYSIND, date) pair found in any dataset into ppdat.
## The date is stored under the name EXAM_DATE regardless of which column it
## came from.

## Append the SYSIND and date columns of one dataset to the pairs collected
## so far. The date column is renamed to EXAM_DATE so the columns line up.
append_visit_dates <- function(collected, dataset_name, date_column) {
  visit_dates <- get(dataset_name)[, c("SYSIND", date_column)]
  names(visit_dates) <- c("SYSIND", "EXAM_DATE")
  rbind(collected, visit_dates)
}

## start from an empty table with the two target columns
ppdat <- data.frame(SYSIND = numeric(0), EXAM_DATE = as.Date(character(0)))

## datasets that already have EXAM_DATE
for (df_name in dfwEXAM_DATE) {
  ppdat <- append_visit_dates(ppdat, df_name, "EXAM_DATE")
}
ppdat <- unique(ppdat)

## datasets without EXAM_DATE use another date column instead:
##   ALZ_CLINICALSUM, ALZ_EXAM, ALZ_NCRAD, ALZ_SCREENING -> FORM_DATE
##   ALZ_LOAD_COG                                        -> INTERVIEW_DATE
##   CONSENSUS_DX                                        -> DATE_DX
dfwFORM_DATE <- c("ALZ_CLINICALSUM","ALZ_EXAM","ALZ_NCRAD","ALZ_SCREENING")
for (df_name in dfwFORM_DATE) {
  ppdat <- append_visit_dates(ppdat, df_name, "FORM_DATE")
}
ppdat <- unique(ppdat)

ppdat <- unique(append_visit_dates(ppdat, "ALZ_LOAD_COG", "INTERVIEW_DATE"))

ppdat <- unique(append_visit_dates(ppdat, "CONSENSUS_DX", "DATE_DX"))

info(ppdat,"SYSIND") #obs:5204, cols:2, inds:1994
## #obs:5204, cols:2, inds:1994
## order the collected pairs by individual, then by date
sorted_ppdat <- dplyr::arrange(ppdat, SYSIND, EXAM_DATE)

##########################################################################################
## Look at the time between consecutive dates of the same individual

# Step 1: gap to the previous date within each SYSIND, in days and in months.
# 30.436875 = 365.2425 / 12 days per month. The first row of every
# individual has no previous date, so its gap is NA.
sorted_ppdat_intervals <- dplyr::ungroup(
  dplyr::mutate(
    dplyr::group_by(sorted_ppdat, SYSIND),
    interval_days = as.numeric(
      difftime(EXAM_DATE, dplyr::lag(EXAM_DATE), units = "days")
    ),
    interval_month = interval_days / 30.436875
  )
)

# Step 2: browse the gaps in an interactive table
DT::datatable(sorted_ppdat_intervals)
# Step 3: histogram of all gaps, in months
hist(sorted_ppdat_intervals$interval_month)

summary(sorted_ppdat_intervals$interval_month)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  0.0329  2.4641  8.8872 12.3731 18.7930 96.0020    1994
##########################################################################################
## Build the Visit_Index variable.
## Within each SYSIND (rows already ordered by date):
##   - the first row opens visit 1;
##   - a row dated 3 or more calendar months after the previous row opens
##     the next visit;
##   - a row dated less than 3 months after the previous row stays in the
##     current visit.
sorted_ppdat2 <- sorted_ppdat %>%
  group_by(SYSIND) %>%
  mutate(previous_exam = lag(EXAM_DATE)) %>%
  mutate(
    new_visit = is.na(previous_exam) |
      EXAM_DATE >= (previous_exam %m+% months(3)),
    Visit_Index = cumsum(new_visit) ## running count of visits opened so far
  ) %>%
  select(-c(previous_exam, new_visit)) %>%
  ungroup()

## Visit summary: one row per SYSIND with the number of distinct visits, the
## number of dated rows, the first and last date, and the days between them.
## Individuals with the most visits are listed first.
visit_summary <- dplyr::arrange(
  dplyr::summarise(
    dplyr::group_by(sorted_ppdat2, SYSIND),
    total_visits = dplyr::n_distinct(Visit_Index),
    total_obs = dplyr::n(),                      # rows in sorted_ppdat2 for this person
    first_date = min(EXAM_DATE, na.rm = TRUE),
    last_date = max(EXAM_DATE, na.rm = TRUE),
    span_days = as.integer(last_date - first_date)
  ),
  dplyr::desc(total_visits)
)

DT::datatable(visit_summary)



Add Visit Index Back to Dataset

## Add Visit_Index back to each dataset that has an EXAM_DATE column.
##
## For every name in dfwEXAM_DATE the dataset is merged with sorted_ppdat2 on
## (SYSIND, EXAM_DATE) and stored as a new object "wVisitIndex_<name>".
## Dimensions are printed before and after the merge via info().
##
## merge() keeps only rows whose key is found in both tables and repeats rows
## when a key matches more than once, so a warning is raised if the row count
## changes. Previously this could only be spotted by comparing the printed
## dimensions by eye.
##
## index numbers the processed datasets; it is left at its final value so
## that later chunks can continue the numbering.
index <- 1
for (df_name in dfwEXAM_DATE) {

  cat("======================================================================","\n")
  df_obj <- get(df_name)  # get the dataframe
  n_rows_before <- nrow(df_obj)

  cat(index,": Currently processing dataset: ",df_name,"\n")
  cat("Dataset dimension: ","\n")
  info(df_obj,"SYSIND")

  df_obj <- merge(df_obj,sorted_ppdat2,by=c("SYSIND","EXAM_DATE"))

  ## guard against rows silently dropped or duplicated by the merge
  if (nrow(df_obj) != n_rows_before) {
    warning(
      "Row count changed while adding Visit_Index to ", df_name, ": ",
      n_rows_before, " -> ", nrow(df_obj),
      call. = FALSE
    )
  }

  cat("After adding visit index, dataset dimension: ","\n")
  info(df_obj,"SYSIND")

  ## store the result under a new name
  newdfname <- paste0("wVisitIndex_",df_name)
  assign(newdfname, df_obj)

  index <- index + 1
}
## ====================================================================== 
## 1 : Currently processing dataset:  AAAD_GERIAT 
## Dataset dimension:  
## #obs:1051, cols:62, inds:939 
## After adding visit index, dataset dimension:  
## #obs:1051, cols:63, inds:939 
## ====================================================================== 
## 2 : Currently processing dataset:  AAAD_MEDCON 
## Dataset dimension:  
## #obs:397, cols:256, inds:367 
## After adding visit index, dataset dimension:  
## #obs:397, cols:257, inds:367 
## ====================================================================== 
## 3 : Currently processing dataset:  AAAD_SOCIO_DEMO 
## Dataset dimension:  
## #obs:402, cols:161, inds:391 
## After adding visit index, dataset dimension:  
## #obs:402, cols:162, inds:391 
## ====================================================================== 
## 4 : Currently processing dataset:  AAAD_TRAILS 
## Dataset dimension:  
## #obs:439, cols:34, inds:428 
## After adding visit index, dataset dimension:  
## #obs:439, cols:35, inds:428 
## ====================================================================== 
## 5 : Currently processing dataset:  ALZ_B9_JUDGE_RC 
## Dataset dimension:  
## #obs:483, cols:82, inds:481 
## After adding visit index, dataset dimension:  
## #obs:483, cols:83, inds:481 
## ====================================================================== 
## 6 : Currently processing dataset:  ALZ_CSDD 
## Dataset dimension:  
## #obs:181, cols:42, inds:176 
## After adding visit index, dataset dimension:  
## #obs:181, cols:43, inds:176 
## ====================================================================== 
## 7 : Currently processing dataset:  ALZ_GAI_SP 
## Dataset dimension:  
## #obs:19, cols:42, inds:19 
## After adding visit index, dataset dimension:  
## #obs:19, cols:43, inds:19 
## ====================================================================== 
## 8 : Currently processing dataset:  ALZ_NEURO_CDR 
## Dataset dimension:  
## #obs:1221, cols:30, inds:1102 
## After adding visit index, dataset dimension:  
## #obs:1221, cols:31, inds:1102 
## ====================================================================== 
## 9 : Currently processing dataset:  ALZ_NPIQ_CBRS 
## Dataset dimension:  
## #obs:122, cols:116, inds:121 
## After adding visit index, dataset dimension:  
## #obs:122, cols:117, inds:121 
## ====================================================================== 
## 10 : Currently processing dataset:  ALZ_RPFQ 
## Dataset dimension:  
## #obs:132, cols:67, inds:132 
## After adding visit index, dataset dimension:  
## #obs:132, cols:68, inds:132 
## ====================================================================== 
## 11 : Currently processing dataset:  ALZ_SCREENING_RC 
## Dataset dimension:  
## #obs:556, cols:61, inds:552 
## After adding visit index, dataset dimension:  
## #obs:556, cols:62, inds:552 
## ====================================================================== 
## 12 : Currently processing dataset:  ALZ_STICK_D_RC 
## Dataset dimension:  
## #obs:430, cols:46, inds:428 
## After adding visit index, dataset dimension:  
## #obs:430, cols:47, inds:428 
## ====================================================================== 
## 13 : Currently processing dataset:  B4_CDR_RC 
## Dataset dimension:  
## #obs:599, cols:38, inds:592 
## After adding visit index, dataset dimension:  
## #obs:599, cols:39, inds:592 
## ====================================================================== 
## 14 : Currently processing dataset:  B5_NPIQ_RC 
## Dataset dimension:  
## #obs:305, cols:38, inds:304 
## After adding visit index, dataset dimension:  
## #obs:305, cols:39, inds:304 
## ====================================================================== 
## 15 : Currently processing dataset:  B6_GDS_RC 
## Dataset dimension:  
## #obs:543, cols:39, inds:539 
## After adding visit index, dataset dimension:  
## #obs:543, cols:40, inds:539 
## ====================================================================== 
## 16 : Currently processing dataset:  B7_FAS_RC 
## Dataset dimension:  
## #obs:435, cols:33, inds:431 
## After adding visit index, dataset dimension:  
## #obs:435, cols:34, inds:431 
## ====================================================================== 
## 17 : Currently processing dataset:  BCF_RECOG_RC 
## Dataset dimension:  
## #obs:266, cols:24, inds:266 
## After adding visit index, dataset dimension:  
## #obs:266, cols:25, inds:266 
## ====================================================================== 
## 18 : Currently processing dataset:  BCFCD_RC 
## Dataset dimension:  
## #obs:269, cols:38, inds:269 
## After adding visit index, dataset dimension:  
## #obs:269, cols:39, inds:269 
## ====================================================================== 
## 19 : Currently processing dataset:  BCFCI_RC 
## Dataset dimension:  
## #obs:270, cols:38, inds:270 
## After adding visit index, dataset dimension:  
## #obs:270, cols:39, inds:270 
## ====================================================================== 
## 20 : Currently processing dataset:  BILINGUAL_SCALE_RC 
## Dataset dimension:  
## #obs:240, cols:90, inds:240 
## After adding visit index, dataset dimension:  
## #obs:240, cols:91, inds:240 
## ====================================================================== 
## 21 : Currently processing dataset:  CAT_FLUENCY_RC 
## Dataset dimension:  
## #obs:555, cols:29, inds:550 
## After adding visit index, dataset dimension:  
## #obs:555, cols:30, inds:550 
## ====================================================================== 
## 22 : Currently processing dataset:  CERAD_DEL_RC 
## Dataset dimension:  
## #obs:177, cols:44, inds:177 
## After adding visit index, dataset dimension:  
## #obs:177, cols:45, inds:177 
## ====================================================================== 
## 23 : Currently processing dataset:  CERAD_IMM_RC 
## Dataset dimension:  
## #obs:188, cols:88, inds:188 
## After adding visit index, dataset dimension:  
## #obs:188, cols:89, inds:188 
## ====================================================================== 
## 24 : Currently processing dataset:  CERAD_RECOG_RC 
## Dataset dimension:  
## #obs:177, cols:48, inds:177 
## After adding visit index, dataset dimension:  
## #obs:177, cols:49, inds:177 
## ====================================================================== 
## 25 : Currently processing dataset:  CRAFT_21_DEL_RC 
## Dataset dimension:  
## #obs:523, cols:95, inds:519 
## After adding visit index, dataset dimension:  
## #obs:523, cols:96, inds:519 
## ====================================================================== 
## 26 : Currently processing dataset:  CRAFT_21_IMM_RC 
## Dataset dimension:  
## #obs:530, cols:98, inds:525 
## After adding visit index, dataset dimension:  
## #obs:530, cols:99, inds:525 
## ====================================================================== 
## 27 : Currently processing dataset:  MEDCON_RC 
## Dataset dimension:  
## #obs:627, cols:237, inds:618 
## After adding visit index, dataset dimension:  
## #obs:627, cols:238, inds:618 
## ====================================================================== 
## 28 : Currently processing dataset:  MEDICAL_HIST 
## Dataset dimension:  
## #obs:889, cols:53, inds:871 
## After adding visit index, dataset dimension:  
## #obs:889, cols:54, inds:871 
## ====================================================================== 
## 29 : Currently processing dataset:  MINT_RC 
## Dataset dimension:  
## #obs:3, cols:221, inds:3 
## After adding visit index, dataset dimension:  
## #obs:3, cols:222, inds:3 
## ====================================================================== 
## 30 : Currently processing dataset:  MINT_SP_RC 
## Dataset dimension:  
## #obs:303, cols:221, inds:301 
## After adding visit index, dataset dimension:  
## #obs:303, cols:222, inds:301 
## ====================================================================== 
## 31 : Currently processing dataset:  MOCA_RC 
## Dataset dimension:  
## #obs:585, cols:140, inds:580 
## After adding visit index, dataset dimension:  
## #obs:585, cols:141, inds:580 
## ====================================================================== 
## 32 : Currently processing dataset:  NUMBER_SPAN_RC 
## Dataset dimension:  
## #obs:527, cols:85, inds:522 
## After adding visit index, dataset dimension:  
## #obs:527, cols:86, inds:522
## Now add Visit_Index to the datasets whose date column is FORM_DATE.
##
## sorted_ppdat3 is a copy of sorted_ppdat2 with EXAM_DATE renamed to
## FORM_DATE so that both merge keys share the same names. Every dataset in
## dfwFORM_DATE is merged on (SYSIND, FORM_DATE) and stored as
## "wVisitIndex_<name>".
##
## merge() keeps only rows whose key is found in both tables and repeats rows
## when a key matches more than once, so a warning is raised if the row count
## changes. Previously this could only be spotted by comparing the printed
## dimensions by eye.
##
## index is not reset here: it continues the numbering of the datasets
## processed before this chunk.
sorted_ppdat3 <- sorted_ppdat2
names(sorted_ppdat3)[names(sorted_ppdat3)=="EXAM_DATE"] <- "FORM_DATE"

for (df_name in dfwFORM_DATE) {
  cat("======================================================================","\n")
  df_obj <- get(df_name)  # get the dataframe
  n_rows_before <- nrow(df_obj)

  cat(index,": Currently processing dataset: ",df_name,"\n")
  cat("Dataset dimension: ","\n")
  info(df_obj,"SYSIND")

  df_obj <- merge(df_obj,sorted_ppdat3,by=c("SYSIND","FORM_DATE"))

  ## guard against rows silently dropped or duplicated by the merge
  if (nrow(df_obj) != n_rows_before) {
    warning(
      "Row count changed while adding Visit_Index to ", df_name, ": ",
      n_rows_before, " -> ", nrow(df_obj),
      call. = FALSE
    )
  }

  cat("After adding visit index, dataset dimension: ","\n")
  info(df_obj,"SYSIND")

  ## store the result under a new name
  newdfname <- paste0("wVisitIndex_",df_name)
  assign(newdfname, df_obj)

  index <- index + 1
}
## ====================================================================== 
## 33 : Currently processing dataset:  ALZ_CLINICALSUM 
## Dataset dimension:  
## #obs:1484, cols:39, inds:1480 
## After adding visit index, dataset dimension:  
## #obs:1484, cols:40, inds:1480 
## ====================================================================== 
## 34 : Currently processing dataset:  ALZ_EXAM 
## Dataset dimension:  
## #obs:526, cols:80, inds:522 
## After adding visit index, dataset dimension:  
## #obs:526, cols:81, inds:522 
## ====================================================================== 
## 35 : Currently processing dataset:  ALZ_NCRAD 
## Dataset dimension:  
## #obs:742, cols:53, inds:742 
## After adding visit index, dataset dimension:  
## #obs:742, cols:54, inds:742 
## ====================================================================== 
## 36 : Currently processing dataset:  ALZ_SCREENING 
## Dataset dimension:  
## #obs:279, cols:49, inds:272 
## After adding visit index, dataset dimension:  
## #obs:279, cols:50, inds:272
## Finally, we add visit index to the last two datasets: ALZ_LOAD_COG and CONSENSUS_DX
# Each of the two datasets uses its own date column as the merge key, so they
# are handled one by one instead of in a loop. The lines starting with "## "
# directly below a statement are the recorded output of that statement.

# ---- ALZ_LOAD_COG: its INTERVIEW_DATE is matched against EXAM_DATE ----
cat("======================================================================","\n")
## ======================================================================
df_obj <- ALZ_LOAD_COG  # get the dataframe
df_name <- "ALZ_LOAD_COG"
cat(index,": Currently processing dataset: ",df_name,"\n")
## 37 : Currently processing dataset:  ALZ_LOAD_COG
cat("Dataset dimension: ","\n")
## Dataset dimension:
info(df_obj,"SYSIND")
## #obs:1006, cols:41, inds:907
# merge() keeps only rows whose (SYSIND, date) key exists in both tables;
# the key columns keep the names from df_obj (by.x).
df_obj <- merge(df_obj,sorted_ppdat2,by.x=c("SYSIND","INTERVIEW_DATE"),by.y = c("SYSIND","EXAM_DATE"))
cat("After adding visit index, dataset dimension: ","\n")
## After adding visit index, dataset dimension:
info(df_obj,"SYSIND")
## #obs:1006, cols:42, inds:907
## change dataset name
# The merged result is stored as a new object named "wVisitIndex_<dataset>".
newdfname <- paste0("wVisitIndex_",df_name)
assign(newdfname, df_obj)
# index continues the running dataset counter started in an earlier chunk.
index = index + 1

# ---- CONSENSUS_DX: its DATE_DX is matched against EXAM_DATE ----
cat("======================================================================","\n")
## ======================================================================
df_obj <- CONSENSUS_DX  # get the dataframe
df_name <- "CONSENSUS_DX"
cat(index,": Currently processing dataset: ",df_name,"\n")
## 38 : Currently processing dataset:  CONSENSUS_DX
cat("Dataset dimension: ","\n")
## Dataset dimension:
info(df_obj,"SYSIND")
## #obs:1701, cols:59, inds:1584
# Same merge as above, keyed on (SYSIND, DATE_DX) on the dataset side.
df_obj <- merge(df_obj,sorted_ppdat2,by.x=c("SYSIND","DATE_DX"),by.y = c("SYSIND","EXAM_DATE"))
cat("After adding visit index, dataset dimension: ","\n")
## After adding visit index, dataset dimension:
info(df_obj,"SYSIND")
## #obs:1701, cols:60, inds:1584
## change dataset name
newdfname <- paste0("wVisitIndex_",df_name)
assign(newdfname, df_obj)


Duplicates Detection by SYSIND and Visit_Index

## From here on df_names refers to the objects whose names start with
## "wVisitIndex_" (the datasets that carry Visit_Index).
df_names <- ls(pattern = "^wVisitIndex_")
length(df_names)
## [1] 38
## For each of those datasets, print the rows whose (SYSIND, Visit_Index)
## pair occurs more than once. An empty tibble means no such rows.
for (df_name in df_names) {
  df <- get(df_name)
  print(
    dplyr::filter(
      dplyr::group_by(df, SYSIND, Visit_Index),
      dplyr::n() > 1
    )
  )
}
## # A tibble: 2 × 63
## # Groups:   SYSIND, Visit_Index [1]
##     SYSIND EXAM_DATE    SYSXM   SYSGP SYSGPSTUDY SYSINDGP CGI_ORDER GPS_ORDER
##      <dbl> <date>       <dbl>   <dbl>      <dbl>    <dbl>     <dbl>     <dbl>
## 1 11221433 2019-10-28 7780783 7929443    1367553  7984703         1         1
## 2 11221433 2020-01-15 7820163 7929443    1367553  7984703         1         1
## # ℹ 55 more variables: STDCGI_ORDER <dbl>, LSTUDY <chr>, DB_OWNER <chr>,
## #   STUDY <chr>, SUBSTUDY <chr>, CENTER <chr>, GP <dbl>, IND <dbl>,
## #   REFCTR <chr>, EXAMINER <chr>, DATE_OF_BIRTH <date>, AGE_AT_EXAM <dbl>,
## #   SATISFIED_LIFE <chr>, DROPPED_ACTIVITIES <chr>, FEEL_EMPTY <chr>,
## #   GOOD_SPIRIT <chr>, AFRAID_BAD_THINGS <chr>, BORED <chr>, FEEL_HAPPY <chr>,
## #   FEEL_HELPLESS <chr>, STAY_HOME <chr>, MEMORY_PROBLEM <chr>, ALIVE <chr>,
## #   FEEL_WORTHLESS <chr>, FEEL_FULL_ENERGY <chr>, FEEL_HOPELESS <chr>, …
## # A tibble: 0 × 257
## # Groups:   SYSIND, Visit_Index [0]
## # ℹ 257 variables: SYSIND <dbl>, EXAM_DATE <date>, SYSXM <dbl>, SYSGP <dbl>,
## #   SYSGPSTUDY <dbl>, SYSINDGP <dbl>, CGI_ORDER <dbl>, GPS_ORDER <dbl>,
## #   STDCGI_ORDER <dbl>, LSTUDY <chr>, DB_OWNER <chr>, STUDY <chr>,
## #   SUBSTUDY <chr>, CENTER <chr>, GP <dbl>, IND <dbl>, REFCTR <chr>,
## #   EXAMINER <chr>, DATE_OF_BIRTH <date>, AGE_AT_EXAM <dbl>,
## #   REVIEW_DATE <date>, REVIEWER <chr>, MEMORY_COMPLAINTS <dbl>,
## #   DATE_OF_ONSET <date>, DOA_UNK <chr>, DESCRIBE <chr>, …
## # A tibble: 0 × 162
## # Groups:   SYSIND, Visit_Index [0]
## # ℹ 162 variables: SYSIND <dbl>, EXAM_DATE <date>, SYSXM <dbl>, SYSGP <dbl>,
## #   SYSGPSTUDY <dbl>, SYSINDGP <dbl>, CGI_ORDER <dbl>, GPS_ORDER <dbl>,
## #   STDCGI_ORDER <dbl>, LSTUDY <chr>, DB_OWNER <chr>, STUDY <chr>,
## #   SUBSTUDY <chr>, CENTER <chr>, GP <dbl>, IND <dbl>, REFCTR <chr>,
## #   EXAMINER <chr>, DATE_OF_BIRTH <date>, AGE_AT_EXAM <dbl>,
## #   REVIEW_DATE <date>, REVIEWER <chr>, SDF1 <dbl>, SDF2 <dbl>, SDF2A <chr>,
## #   SDF3 <dbl>, SDF3A <chr>, SDF4 <chr>, SDF4A <chr>, SDF5 <chr>, …
## # A tibble: 0 × 35
## # Groups:   SYSIND, Visit_Index [0]
## # ℹ 35 variables: SYSIND <dbl>, EXAM_DATE <date>, SYSXM <dbl>, SYSGP <dbl>,
## #   SYSGPSTUDY <dbl>, SYSINDGP <dbl>, CGI_ORDER <dbl>, GPS_ORDER <dbl>,
## #   STDCGI_ORDER <dbl>, LSTUDY <chr>, DB_OWNER <chr>, STUDY <chr>,
## #   SUBSTUDY <chr>, CENTER <chr>, GP <dbl>, IND <dbl>, REFCTR <chr>,
## #   EXAMINER <chr>, DATE_OF_BIRTH <date>, AGE_AT_EXAM <dbl>,
## #   REVIEW_DATE <date>, REVIEWER <chr>, TIME_A <dbl>, TIME_AMISS <dbl>,
## #   ERR_A <dbl>, ERR_AMISS <dbl>, COR_A <dbl>, COR_AMISS <dbl>, TIME_B <dbl>, …
## # A tibble: 0 × 83
## # Groups:   SYSIND, Visit_Index [0]
## # ℹ 83 variables: SYSIND <dbl>, EXAM_DATE <date>, SYSXM <dbl>, SYSGP <dbl>,
## #   SYSGPSTUDY <dbl>, SYSINDGP <dbl>, CGI_ORDER <dbl>, GPS_ORDER <dbl>,
## #   STDCGI_ORDER <dbl>, LSTUDY <chr>, DB_OWNER <chr>, STUDY <chr>,
## #   SUBSTUDY <chr>, CENTER <chr>, GP <dbl>, IND <dbl>, REFCTR <chr>,
## #   EXAMINER <chr>, DATE_OF_BIRTH <date>, AGE_AT_EXAM <dbl>,
## #   REVIEW_DATE <date>, REVIEWER <chr>, MEMORY_DECLINE <dbl>,
## #   COP_RPT_MEMDECLINE <dbl>, MEANINGFUL_IMP <dbl>, IMP_MEMORY <dbl>, …
## # A tibble: 2 × 40
## # Groups:   SYSIND, Visit_Index [1]
##     SYSIND FORM_DATE    SYSXM   SYSGP SYSGPSTUDY SYSINDGP CGI_ORDER GPS_ORDER
##      <dbl> <date>       <dbl>   <dbl>      <dbl>    <dbl>     <dbl>     <dbl>
## 1 11220743 2020-05-20 7867513 7888893    1304233  7984013         1         1
## 2 11220743 2020-07-20 7655693 7888893    1304233  7984013         1         1
## # ℹ 32 more variables: STDCGI_ORDER <dbl>, LSTUDY <chr>, DB_OWNER <chr>,
## #   STUDY <chr>, SUBSTUDY <chr>, CENTER <chr>, GP <dbl>, IND <dbl>,
## #   REFCTR <chr>, DATE_OF_BIRTH <date>, LAST_CONTACT_DATE <date>,
## #   LAST_CONTACT_AGE <dbl>, AGE_OF_DEATH <dbl>, AGE_OF_EXAM <dbl>,
## #   IMPRESSION <chr>, AD_CATEGORY <chr>, AGE_OF_ONSET <dbl>,
## #   AOO_DOC_EST_UNK <chr>, AGE_OF_DIAGNOSIS <dbl>, AODX_UNKNOWN <chr>,
## #   AD_HX_CATEGORY <chr>, UNCLEAR_CATEGORY <chr>, DEMENT_NAME <chr>, …
## # A tibble: 2 × 43
## # Groups:   SYSIND, Visit_Index [1]
##     SYSIND EXAM_DATE    SYSXM   SYSGP SYSGPSTUDY SYSINDGP CGI_ORDER GPS_ORDER
##      <dbl> <date>       <dbl>   <dbl>      <dbl>    <dbl>     <dbl>     <dbl>
## 1 11008433 2019-01-23 7627553 7888973    1304313  7763233         1         1
## 2 11008433 2019-02-22 7670553 7888973    1304313  7763233         1         1
## # ℹ 35 more variables: STDCGI_ORDER <dbl>, LSTUDY <chr>, DB_OWNER <chr>,
## #   STUDY <chr>, SUBSTUDY <chr>, CENTER <chr>, GP <dbl>, IND <dbl>,
## #   REFCTR <chr>, EXAMINER <chr>, DATE_OF_BIRTH <date>, AGE_AT_EXAM <dbl>,
## #   ANXIETY <dbl>, SADNESS <dbl>, LACK_REACTION <dbl>, IRRITABILITY <dbl>,
## #   AGITATION <dbl>, RETARDATION <dbl>, MULTI_COMPLAINTS <dbl>,
## #   LOSS_INTEREST <dbl>, LOSS_APPETITE <dbl>, LOSS_WEIGHT <dbl>,
## #   LACK_ENERGY <dbl>, DIURNAL_MOOD <dbl>, DIFF_ASLEEP <dbl>, …
## # A tibble: 0 × 81
## # Groups:   SYSIND, Visit_Index [0]
## # ℹ 81 variables: SYSIND <dbl>, FORM_DATE <date>, SYSXM <dbl>, SYSGP <dbl>,
## #   SYSGPSTUDY <dbl>, SYSINDGP <dbl>, CGI_ORDER <dbl>, GPS_ORDER <dbl>,
## #   STDCGI_ORDER <dbl>, LSTUDY <chr>, DB_OWNER <chr>, STUDY <chr>,
## #   SUBSTUDY <chr>, CENTER <chr>, GP <dbl>, IND <dbl>, REFCTR <chr>,
## #   FILLED_OUT_BY <chr>, DATE_OF_BIRTH <date>, NEURO_METHOD <chr>,
## #   NEURO_EXAM_DATE <date>, NEURO_EXAMINER <chr>, MOOD_AFFECT <chr>,
## #   DEPRESSED <chr>, MANIC <chr>, MOOD_OTHER <chr>, MOOD_OTHER_DSC <chr>, …
## # A tibble: 0 × 43
## # Groups:   SYSIND, Visit_Index [0]
## # ℹ 43 variables: SYSIND <dbl>, EXAM_DATE <date>, SYSXM <dbl>, SYSGP <dbl>,
## #   SYSGPSTUDY <dbl>, SYSINDGP <dbl>, CGI_ORDER <dbl>, GPS_ORDER <dbl>,
## #   STDCGI_ORDER <dbl>, LSTUDY <chr>, DB_OWNER <chr>, STUDY <chr>,
## #   SUBSTUDY <chr>, CENTER <chr>, GP <dbl>, IND <dbl>, REFCTR <chr>,
## #   EXAMINER <chr>, DATE_OF_BIRTH <date>, AGE_AT_EXAM <dbl>,
## #   REVIEW_DATE <date>, REVIEWER <chr>, WORRY_ALOT <dbl>,
## #   DIFF_MAKE_DECISION <dbl>, FEEL_JUMPY <dbl>, HARD_TO_RELAX <dbl>, …
## # A tibble: 0 × 42
## # Groups:   SYSIND, Visit_Index [0]
## # ℹ 42 variables: SYSIND <dbl>, INTERVIEW_DATE <date>, SYSXM <dbl>,
## #   SYSGP <dbl>, SYSGPSTUDY <dbl>, SYSINDGP <dbl>, CGI_ORDER <dbl>,
## #   GPS_ORDER <dbl>, STDCGI_ORDER <dbl>, LSTUDY <chr>, DB_OWNER <chr>,
## #   STUDY <chr>, SUBSTUDY <chr>, CENTER <chr>, GP <dbl>, IND <dbl>,
## #   REFCTR <chr>, INTERVIEWER <chr>, DATE_OF_BIRTH <date>, INTERVIEW_AGE <dbl>,
## #   VERSION <chr>, PHONE <dbl>, STORY <dbl>, DIGFOR <dbl>, DIGBAK <dbl>,
## #   ANIMALS <dbl>, FRUITS <dbl>, VEG <dbl>, DIGORD <dbl>, DELAY <dbl>, …
## # A tibble: 0 × 54
## # Groups:   SYSIND, Visit_Index [0]
## # ℹ 54 variables: SYSIND <dbl>, FORM_DATE <date>, SYSXM <dbl>, SYSGP <dbl>,
## #   SYSGPSTUDY <dbl>, SYSINDGP <dbl>, CGI_ORDER <dbl>, GPS_ORDER <dbl>,
## #   STDCGI_ORDER <dbl>, LSTUDY <chr>, DB_OWNER <chr>, STUDY <chr>,
## #   SUBSTUDY <chr>, CENTER <chr>, GP <dbl>, IND <dbl>, REFCTR <chr>,
## #   QUALIFY <chr>, FILLED_OUT_BY <chr>, DATE_OF_BIRTH <date>, IN_NCRAD <chr>,
## #   SAMPLED <dbl>, EDUC <dbl>, VISIT <dbl>, COMREQ <dbl>, NOTDEMCI <dbl>,
## #   EVALMETH <dbl>, EVALYR <dbl>, CLDEMLEW <dbl>, COMDXAD <chr>, …
## # A tibble: 10 × 31
## # Groups:   SYSIND, Visit_Index [5]
##      SYSIND EXAM_DATE    SYSXM   SYSGP SYSGPSTUDY SYSINDGP CGI_ORDER GPS_ORDER
##       <dbl> <date>       <dbl>   <dbl>      <dbl>    <dbl>     <dbl>     <dbl>
##  1 11005393 2019-02-05 7630723 7888563    1303903  7760193         1         1
##  2 11005393 2019-03-07 7659523 7888563    1303903  7760193         1         1
##  3 11008433 2019-01-23 7627483 7888973    1304313  7763233         1         1
##  4 11008433 2019-02-22 7670543 7888973    1304313  7763233         1         1
##  5 11009423 2019-10-11 7753863 7889193    1304533  7764243         1         1
##  6 11009423 2019-10-23 7764193 7889193    1304533  7764243         1         1
##  7 11161523 2022-03-02 8041863 7924113    1362223  7924633         1         1
##  8 11161523 2022-03-30 8081813 7924113    1362223  7924633         1         1
##  9 11221433 2019-10-28 7780773 7929443    1367553  7984703         1         1
## 10 11221433 2020-01-15 7820153 7929443    1367553  7984703         1         1
## # ℹ 23 more variables: STDCGI_ORDER <dbl>, LSTUDY <chr>, DB_OWNER <chr>,
## #   STUDY <chr>, SUBSTUDY <chr>, CENTER <chr>, GP <dbl>, IND <dbl>,
## #   REFCTR <chr>, EXAMINER <chr>, DATE_OF_BIRTH <date>, AGE_AT_EXAM <dbl>,
## #   METHOD <chr>, RECONSTRUCTED <chr>, CDR_TOTAL <dbl>, MEMORY <dbl>,
## #   ORIENTATION <dbl>, PROBLEM_SOLVE <dbl>, COM_AFFAIR <dbl>,
## #   HOME_HOBBIES <dbl>, PERSONAL_CARE <dbl>, CDR_COMM <chr>, Visit_Index <int>
## # A tibble: 0 × 117
## # Groups:   SYSIND, Visit_Index [0]
## # ℹ 117 variables: SYSIND <dbl>, EXAM_DATE <date>, SYSXM <dbl>, SYSGP <dbl>,
## #   SYSGPSTUDY <dbl>, SYSINDGP <dbl>, CGI_ORDER <dbl>, GPS_ORDER <dbl>,
## #   STDCGI_ORDER <dbl>, LSTUDY <chr>, DB_OWNER <chr>, STUDY <chr>,
## #   SUBSTUDY <chr>, CENTER <chr>, GP <dbl>, IND <dbl>, REFCTR <chr>,
## #   EXAMINER <chr>, DATE_OF_BIRTH <date>, AGE_AT_EXAM <dbl>, NPIQINF <chr>,
## #   NPIQINF_PRO <chr>, NPIQINF_OTH <chr>, NPIQINFA <dbl>, NPIQINFB <dbl>,
## #   NPIQTYPE <dbl>, AGIT <dbl>, AGITSEV <dbl>, AGITATION_DIST <dbl>, …
## # A tibble: 0 × 68
## # Groups:   SYSIND, Visit_Index [0]
## # ℹ 68 variables: SYSIND <dbl>, EXAM_DATE <date>, SYSXM <dbl>, SYSGP <dbl>,
## #   SYSGPSTUDY <dbl>, SYSINDGP <dbl>, CGI_ORDER <dbl>, GPS_ORDER <dbl>,
## #   STDCGI_ORDER <dbl>, LSTUDY <chr>, DB_OWNER <chr>, STUDY <chr>,
## #   SUBSTUDY <chr>, CENTER <chr>, GP <dbl>, IND <dbl>, REFCTR <chr>,
## #   EXAMINER <chr>, DATE_OF_BIRTH <date>, AGE_AT_EXAM <dbl>,
## #   REVIEW_DATE <date>, REVIEWER <chr>, SMOKE <dbl>, SMOKE_AGE_START <dbl>,
## #   SMOKE_CURR <dbl>, SMOKE_AGE_STOP <dbl>, PREGNANCIES <dbl>, …
## # A tibble: 0 × 50
## # Groups:   SYSIND, Visit_Index [0]
## # ℹ 50 variables: SYSIND <dbl>, FORM_DATE <date>, SYSXM <dbl>, SYSGP <dbl>,
## #   SYSGPSTUDY <dbl>, SYSINDGP <dbl>, CGI_ORDER <dbl>, GPS_ORDER <dbl>,
## #   STDCGI_ORDER <dbl>, LSTUDY <chr>, DB_OWNER <chr>, STUDY <chr>,
## #   SUBSTUDY <chr>, CENTER <chr>, GP <dbl>, IND <dbl>, REFCTR <chr>,
## #   FILLED_OUT_BY <chr>, DATE_OF_BIRTH <date>, LUMBAR_YES_NO <chr>,
## #   LUMBAR_DATE <date>, LUMBAR_NO_DATE <chr>, LUMBAR_PUNCTURE <chr>,
## #   BRAIN_MRI_YES_NO <chr>, BRAIN_MRI_DATE <date>, BRAIN_MRI_NO_DATE <chr>, …
## # A tibble: 0 × 62
## # Groups:   SYSIND, Visit_Index [0]
## # ℹ 62 variables: SYSIND <dbl>, EXAM_DATE <date>, SYSXM <dbl>, SYSGP <dbl>,
## #   SYSGPSTUDY <dbl>, SYSINDGP <dbl>, CGI_ORDER <dbl>, GPS_ORDER <dbl>,
## #   STDCGI_ORDER <dbl>, LSTUDY <chr>, DB_OWNER <chr>, STUDY <chr>,
## #   SUBSTUDY <chr>, CENTER <chr>, GP <dbl>, IND <dbl>, REFCTR <chr>,
## #   EXAMINER <chr>, DATE_OF_BIRTH <date>, AGE_AT_EXAM <dbl>,
## #   REVIEW_DATE <date>, REVIEWER <chr>, LUMB_YN <chr>, LUMB_DT <date>,
## #   LUMB_PUNC <chr>, LUMB_NOTES <chr>, BRNMRI_YN <chr>, BRNMRI_DT <date>, …
## # A tibble: 0 × 47
## # Groups:   SYSIND, Visit_Index [0]
## # ℹ 47 variables: SYSIND <dbl>, EXAM_DATE <date>, SYSXM <dbl>, SYSGP <dbl>,
## #   SYSGPSTUDY <dbl>, SYSINDGP <dbl>, CGI_ORDER <dbl>, GPS_ORDER <dbl>,
## #   STDCGI_ORDER <dbl>, LSTUDY <chr>, DB_OWNER <chr>, STUDY <chr>,
## #   SUBSTUDY <chr>, CENTER <chr>, GP <dbl>, IND <dbl>, REFCTR <chr>,
## #   EXAMINER <chr>, DATE_OF_BIRTH <date>, AGE_AT_EXAM <dbl>,
## #   REVIEW_DATE <date>, REVIEWER <chr>, DRSD_I <dbl>, DRSD_II <dbl>,
## #   DRSD_III <dbl>, DRSD_IV <dbl>, DRSD_V <dbl>, DRSD_VI <dbl>, …
## # A tibble: 0 × 39
## # Groups:   SYSIND, Visit_Index [0]
## # ℹ 39 variables: SYSIND <dbl>, EXAM_DATE <date>, SYSXM <dbl>, SYSGP <dbl>,
## #   SYSGPSTUDY <dbl>, SYSINDGP <dbl>, CGI_ORDER <dbl>, GPS_ORDER <dbl>,
## #   STDCGI_ORDER <dbl>, LSTUDY <chr>, DB_OWNER <chr>, STUDY <chr>,
## #   SUBSTUDY <chr>, CENTER <chr>, GP <dbl>, IND <dbl>, REFCTR <chr>,
## #   EXAMINER <chr>, DATE_OF_BIRTH <date>, AGE_AT_EXAM <dbl>,
## #   REVIEW_DATE <date>, REVIEWER <chr>, METHOD_CDR <chr>, MEMO_NOTE <chr>,
## #   MEMO_SC <dbl>, ORIENT_NOTE <chr>, ORIENT_SC <dbl>, P_SOLVE_NOTE <chr>, …
## # A tibble: 0 × 39
## # Groups:   SYSIND, Visit_Index [0]
## # ℹ 39 variables: SYSIND <dbl>, EXAM_DATE <date>, SYSXM <dbl>, SYSGP <dbl>,
## #   SYSGPSTUDY <dbl>, SYSINDGP <dbl>, CGI_ORDER <dbl>, GPS_ORDER <dbl>,
## #   STDCGI_ORDER <dbl>, LSTUDY <chr>, DB_OWNER <chr>, STUDY <chr>,
## #   SUBSTUDY <chr>, CENTER <chr>, GP <dbl>, IND <dbl>, REFCTR <chr>,
## #   EXAMINER <chr>, DATE_OF_BIRTH <date>, AGE_AT_EXAM <dbl>,
## #   REVIEW_DATE <date>, REVIEWER <chr>, NPIQINF <dbl>, NPIQINF_OTH <chr>,
## #   NPIQTYPE <dbl>, DELSEV <dbl>, HALLSEV <dbl>, AGITSEV <dbl>, …
## # A tibble: 0 × 40
## # Groups:   SYSIND, Visit_Index [0]
## # ℹ 40 variables: SYSIND <dbl>, EXAM_DATE <date>, SYSXM <dbl>, SYSGP <dbl>,
## #   SYSGPSTUDY <dbl>, SYSINDGP <dbl>, CGI_ORDER <dbl>, GPS_ORDER <dbl>,
## #   STDCGI_ORDER <dbl>, LSTUDY <chr>, DB_OWNER <chr>, STUDY <chr>,
## #   SUBSTUDY <chr>, CENTER <chr>, GP <dbl>, IND <dbl>, REFCTR <chr>,
## #   EXAMINER <chr>, DATE_OF_BIRTH <date>, AGE_AT_EXAM <dbl>,
## #   REVIEW_DATE <date>, REVIEWER <chr>, LIFE <dbl>, ACTIVITY <dbl>,
## #   EMPTY <dbl>, BORED <dbl>, SPIRIT <dbl>, AFRAID <dbl>, HAPPY <dbl>, …
## # A tibble: 0 × 34
## # Groups:   SYSIND, Visit_Index [0]
## # ℹ 34 variables: SYSIND <dbl>, EXAM_DATE <date>, SYSXM <dbl>, SYSGP <dbl>,
## #   SYSGPSTUDY <dbl>, SYSINDGP <dbl>, CGI_ORDER <dbl>, GPS_ORDER <dbl>,
## #   STDCGI_ORDER <dbl>, LSTUDY <chr>, DB_OWNER <chr>, STUDY <chr>,
## #   SUBSTUDY <chr>, CENTER <chr>, GP <dbl>, IND <dbl>, REFCTR <chr>,
## #   EXAMINER <chr>, DATE_OF_BIRTH <date>, AGE_AT_EXAM <dbl>,
## #   REVIEW_DATE <date>, REVIEWER <chr>, FAQ1 <dbl>, FAQ2 <dbl>, FAQ3 <dbl>,
## #   FAQ4 <dbl>, FAQ5 <dbl>, FAQ6 <dbl>, FAQ7 <dbl>, FAQ8 <dbl>, FAQ9 <dbl>, …
## # A tibble: 0 × 25
## # Groups:   SYSIND, Visit_Index [0]
## # ℹ 25 variables: SYSIND <dbl>, EXAM_DATE <date>, SYSXM <dbl>, SYSGP <dbl>,
## #   SYSGPSTUDY <dbl>, SYSINDGP <dbl>, CGI_ORDER <dbl>, GPS_ORDER <dbl>,
## #   STDCGI_ORDER <dbl>, LSTUDY <chr>, DB_OWNER <chr>, STUDY <chr>,
## #   SUBSTUDY <chr>, CENTER <chr>, GP <dbl>, IND <dbl>, REFCTR <chr>,
## #   EXAMINER <chr>, DATE_OF_BIRTH <date>, AGE_AT_EXAM <dbl>,
## #   REVIEW_DATE <date>, REVIEWER <chr>, CBF_RECOGNIZE_STIMULUS <dbl>,
## #   COMMENTS_BCFRECOGN <chr>, Visit_Index <int>
## # A tibble: 0 × 39
## # Groups:   SYSIND, Visit_Index [0]
## # ℹ 39 variables: SYSIND <dbl>, EXAM_DATE <date>, SYSXM <dbl>, SYSGP <dbl>,
## #   SYSGPSTUDY <dbl>, SYSINDGP <dbl>, CGI_ORDER <dbl>, GPS_ORDER <dbl>,
## #   STDCGI_ORDER <dbl>, LSTUDY <chr>, DB_OWNER <chr>, STUDY <chr>,
## #   SUBSTUDY <chr>, CENTER <chr>, GP <dbl>, IND <dbl>, REFCTR <chr>,
## #   EXAMINER <chr>, DATE_OF_BIRTH <date>, AGE_AT_EXAM <dbl>,
## #   REVIEW_DATE <date>, REVIEWER <chr>, FOURSIDED_DELAY <dbl>,
## #   STRAIGHT_LINES_DELAY <dbl>, MIDDLETHIRD_DELAY <dbl>, ROUND_DELAY <dbl>, …
## # A tibble: 0 × 39
## # Groups:   SYSIND, Visit_Index [0]
## # ℹ 39 variables: SYSIND <dbl>, EXAM_DATE <date>, SYSXM <dbl>, SYSGP <dbl>,
## #   SYSGPSTUDY <dbl>, SYSINDGP <dbl>, CGI_ORDER <dbl>, GPS_ORDER <dbl>,
## #   STDCGI_ORDER <dbl>, LSTUDY <chr>, DB_OWNER <chr>, STUDY <chr>,
## #   SUBSTUDY <chr>, CENTER <chr>, GP <dbl>, IND <dbl>, REFCTR <chr>,
## #   EXAMINER <chr>, DATE_OF_BIRTH <date>, AGE_AT_EXAM <dbl>,
## #   REVIEW_DATE <date>, REVIEWER <chr>, FOURSIDED <dbl>, STRAIGHT_LINES <dbl>,
## #   MIDDLETHIRD <dbl>, ROUND <dbl>, VERTICAL_LINES <dbl>, BELOW3 <dbl>, …
## # A tibble: 0 × 91
## # Groups:   SYSIND, Visit_Index [0]
## # ℹ 91 variables: SYSIND <dbl>, EXAM_DATE <date>, SYSXM <dbl>, SYSGP <dbl>,
## #   SYSGPSTUDY <dbl>, SYSINDGP <dbl>, CGI_ORDER <dbl>, GPS_ORDER <dbl>,
## #   STDCGI_ORDER <dbl>, LSTUDY <chr>, DB_OWNER <chr>, STUDY <chr>,
## #   SUBSTUDY <chr>, CENTER <chr>, GP <dbl>, IND <dbl>, REFCTR <chr>,
## #   EXAMINER <chr>, DATE_OF_BIRTH <date>, AGE_AT_EXAM <dbl>,
## #   REVIEW_DATE <date>, REVIEWER <chr>, BILING_YEAR_EDU <dbl>,
## #   BILING_LANG <chr>, BILING_OTHER_LANG <dbl>, BILINGUAL_LANG_YES1 <chr>, …
## # A tibble: 0 × 30
## # Groups:   SYSIND, Visit_Index [0]
## # ℹ 30 variables: SYSIND <dbl>, EXAM_DATE <date>, SYSXM <dbl>, SYSGP <dbl>,
## #   SYSGPSTUDY <dbl>, SYSINDGP <dbl>, CGI_ORDER <dbl>, GPS_ORDER <dbl>,
## #   STDCGI_ORDER <dbl>, LSTUDY <chr>, DB_OWNER <chr>, STUDY <chr>,
## #   SUBSTUDY <chr>, CENTER <chr>, GP <dbl>, IND <dbl>, REFCTR <chr>,
## #   EXAMINER <chr>, DATE_OF_BIRTH <date>, AGE_AT_EXAM <dbl>,
## #   REVIEW_DATE <date>, REVIEWER <chr>, ANIM_ENTRY <chr>, ANIM_SCORE <dbl>,
## #   ANIM_STATUS <dbl>, VEG_ENTRY <chr>, VEG_SCORE <dbl>, VEG_STATUS <dbl>, …
## # A tibble: 0 × 45
## # Groups:   SYSIND, Visit_Index [0]
## # ℹ 45 variables: SYSIND <dbl>, EXAM_DATE <date>, SYSXM <dbl>, SYSGP <dbl>,
## #   SYSGPSTUDY <dbl>, SYSINDGP <dbl>, CGI_ORDER <dbl>, GPS_ORDER <dbl>,
## #   STDCGI_ORDER <dbl>, LSTUDY <chr>, DB_OWNER <chr>, STUDY <chr>,
## #   SUBSTUDY <chr>, CENTER <chr>, GP <dbl>, IND <dbl>, REFCTR <chr>,
## #   EXAMINER <chr>, DATE_OF_BIRTH <date>, AGE_AT_EXAM <dbl>,
## #   REVIEW_DATE <date>, REVIEWER <chr>, WLM_CRTA <dbl>, WLM_CRTB <dbl>,
## #   WLM_CRTC <dbl>, WLM_CRTD <dbl>, WLM_CRTE <dbl>, WLM_CRTF <dbl>, …
## # A tibble: 0 × 89
## # Groups:   SYSIND, Visit_Index [0]
## # ℹ 89 variables: SYSIND <dbl>, EXAM_DATE <date>, SYSXM <dbl>, SYSGP <dbl>,
## #   SYSGPSTUDY <dbl>, SYSINDGP <dbl>, CGI_ORDER <dbl>, GPS_ORDER <dbl>,
## #   STDCGI_ORDER <dbl>, LSTUDY <chr>, DB_OWNER <chr>, STUDY <chr>,
## #   SUBSTUDY <chr>, CENTER <chr>, GP <dbl>, IND <dbl>, REFCTR <chr>,
## #   EXAMINER <chr>, DATE_OF_BIRTH <date>, AGE_AT_EXAM <dbl>,
## #   REVIEW_DATE <date>, REVIEWER <chr>, CERAD_PRESENTATION <dbl>, WLM_1A <dbl>,
## #   WLM_1B <dbl>, WLM_1C <dbl>, WLM_1D <dbl>, WLM_1E <dbl>, WLM_1F <dbl>, …
## # A tibble: 0 × 49
## # Groups:   SYSIND, Visit_Index [0]
## # ℹ 49 variables: SYSIND <dbl>, EXAM_DATE <date>, SYSXM <dbl>, SYSGP <dbl>,
## #   SYSGPSTUDY <dbl>, SYSINDGP <dbl>, CGI_ORDER <dbl>, GPS_ORDER <dbl>,
## #   STDCGI_ORDER <dbl>, LSTUDY <chr>, DB_OWNER <chr>, STUDY <chr>,
## #   SUBSTUDY <chr>, CENTER <chr>, GP <dbl>, IND <dbl>, REFCTR <chr>,
## #   EXAMINER <chr>, DATE_OF_BIRTH <date>, AGE_AT_EXAM <dbl>,
## #   REVIEW_DATE <date>, REVIEWER <chr>, WLRG_PRESENT <dbl>, WLRG_K <dbl>,
## #   WLRG_L <dbl>, WLRG_A <dbl>, WLRG_M <dbl>, WLRG_B <dbl>, WLRG_C <dbl>, …
## # A tibble: 14 × 60
## # Groups:   SYSIND, Visit_Index [7]
##      SYSIND DATE_DX      SYSXM   SYSGP SYSGPSTUDY SYSINDGP CGI_ORDER GPS_ORDER
##       <dbl> <date>       <dbl>   <dbl>      <dbl>    <dbl>     <dbl>     <dbl>
##  1 11008753 2018-06-07 7575353 7888993    1304333  7763553         1         1
##  2 11008753 2018-07-18 7575353 7888993    1304333  7763553         1         1
##  3 11009263 2020-05-20 7867013 7889133    1304473  7764083         1         1
##  4 11009263 2020-05-21 7865833 7889133    1304473  7764083         1         1
##  5 11040003 2022-01-09 7825303 7896303    1311623  7795743         1         1
##  6 11040003 2022-01-09 8160893 7896303    1311623  7795743         1         1
##  7 11044283 2018-07-12 7584693 7894093    1309413  7800143         1         1
##  8 11044283 2018-10-02 7584693 7894093    1309413  7800143         1         1
##  9 11052753 2020-05-05 7859703 7897783    1313103  7808613         1         1
## 10 11052753 2020-05-20 7867483 7897783    1313103  7808613         1         1
## 11 11109763 2022-01-11 7827723 7921113    1359223  7869283         1         1
## 12 11109763 2022-03-24 8160793 7921113    1359223  7869283         1         1
## 13 11345993 2020-07-24 7877783 7943343    1384253  8115263         1         1
## 14 11345993 2020-07-28 7877783 7943343    1384253  8115263         1         1
## # ℹ 52 more variables: STDCGI_ORDER <dbl>, LSTUDY <chr>, DB_OWNER <chr>,
## #   STUDY <chr>, SUBSTUDY <chr>, CENTER <chr>, GP <dbl>, IND <dbl>,
## #   REFCTR <chr>, DATE_OF_BIRTH <date>, CLINICAL_COMMENTS <chr>,
## #   OTHER_TXT1 <chr>, OTHER_TXT2 <chr>, OTHER_TXT3 <chr>, CALC_VAL1 <dbl>,
## #   CALC_VAL2 <dbl>, CALC_VAL3 <dbl>, CALC_VAL4 <dbl>, CALC_VAL5 <dbl>,
## #   CALC_VAL6 <dbl>, CALC_VAL7 <dbl>, CALC_VAL8 <dbl>, CALC_VAL9 <dbl>,
## #   CALC_VAL10 <dbl>, CALC_VAL11 <dbl>, LAST_SOURCE <chr>, …
## # A tibble: 0 × 96
## # Groups:   SYSIND, Visit_Index [0]
## # ℹ 96 variables: SYSIND <dbl>, EXAM_DATE <date>, SYSXM <dbl>, SYSGP <dbl>,
## #   SYSGPSTUDY <dbl>, SYSINDGP <dbl>, CGI_ORDER <dbl>, GPS_ORDER <dbl>,
## #   STDCGI_ORDER <dbl>, LSTUDY <chr>, DB_OWNER <chr>, STUDY <chr>,
## #   SUBSTUDY <chr>, CENTER <chr>, GP <dbl>, IND <dbl>, REFCTR <chr>,
## #   EXAMINER <chr>, DATE_OF_BIRTH <date>, AGE_AT_EXAM <dbl>,
## #   REVIEW_DATE <date>, REVIEWER <chr>, CRAFTDVR_ENTRY <chr>, CRAFTDTI <date>,
## #   CRAFTDVR1 <dbl>, CRAFTDVR2 <dbl>, CRAFTDVR3 <dbl>, CRAFTDVR4 <dbl>, …
## # A tibble: 0 × 99
## # Groups:   SYSIND, Visit_Index [0]
## # ℹ 99 variables: SYSIND <dbl>, EXAM_DATE <date>, SYSXM <dbl>, SYSGP <dbl>,
## #   SYSGPSTUDY <dbl>, SYSINDGP <dbl>, CGI_ORDER <dbl>, GPS_ORDER <dbl>,
## #   STDCGI_ORDER <dbl>, LSTUDY <chr>, DB_OWNER <chr>, STUDY <chr>,
## #   SUBSTUDY <chr>, CENTER <chr>, GP <dbl>, IND <dbl>, REFCTR <chr>,
## #   EXAMINER <chr>, DATE_OF_BIRTH <date>, AGE_AT_EXAM <dbl>,
## #   REVIEW_DATE <date>, REVIEWER <chr>, CRAFTVRS_ENTRY <chr>,
## #   CRAFTVRS_TIME <dttm>, CRAFTVRS1 <dbl>, CRAFTVRS2 <dbl>, CRAFTVRS3 <dbl>, …
## # A tibble: 0 × 238
## # Groups:   SYSIND, Visit_Index [0]
## # ℹ 238 variables: SYSIND <dbl>, EXAM_DATE <date>, SYSXM <dbl>, SYSGP <dbl>,
## #   SYSGPSTUDY <dbl>, SYSINDGP <dbl>, CGI_ORDER <dbl>, GPS_ORDER <dbl>,
## #   STDCGI_ORDER <dbl>, LSTUDY <chr>, DB_OWNER <chr>, STUDY <chr>,
## #   SUBSTUDY <chr>, CENTER <chr>, GP <dbl>, IND <dbl>, REFCTR <chr>,
## #   EXAMINER <chr>, DATE_OF_BIRTH <date>, AGE_AT_EXAM <dbl>,
## #   REVIEW_DATE <date>, REVIEWER <chr>, MEMORY_COMPLAINTS <dbl>,
## #   DATE_OF_ONSET <date>, DOA_UNK <chr>, DESCRIBE <chr>, …
## # A tibble: 8 × 54
## # Groups:   SYSIND, Visit_Index [4]
##     SYSIND EXAM_DATE    SYSXM   SYSGP SYSGPSTUDY SYSINDGP CGI_ORDER GPS_ORDER
##      <dbl> <date>       <dbl>   <dbl>      <dbl>    <dbl>     <dbl>     <dbl>
## 1 11008433 2019-01-23 7628443 7888973    1304313  7763233         1         1
## 2 11008433 2019-02-22 7670513 7888973    1304313  7763233         1         1
## 3 11008733 2018-03-16 7544683 7888993    1304333  7763533         1         1
## 4 11008733 2018-06-19 7578663 7888993    1304333  7763533         1         1
## 5 11039563 2018-01-08 7493593 7896073    1311393  7795303         1         1
## 6 11039563 2018-03-27 7553803 7896073    1311393  7795303         1         1
## 7 11221433 2019-10-28 7780793 7929443    1367553  7984703         1         1
## 8 11221433 2020-01-15 7820143 7929443    1367553  7984703         1         1
## # ℹ 46 more variables: STDCGI_ORDER <dbl>, LSTUDY <chr>, DB_OWNER <chr>,
## #   STUDY <chr>, SUBSTUDY <chr>, CENTER <chr>, GP <dbl>, IND <dbl>,
## #   REFCTR <chr>, EXAMINER <chr>, DATE_OF_BIRTH <date>, AGE_AT_EXAM <dbl>,
## #   XMSTUDY <chr>, RELATION <chr>, ANXIETY <chr>, ASTHMA <chr>, A_D_D <chr>,
## #   AUTISM <chr>, CANCER <chr>, CANCER_TYPE <chr>, DEPRESSION <chr>,
## #   DIABETES_TYPE1 <chr>, DIABETES_TYPE2 <chr>, DIABETES <chr>,
## #   LIPIDS_CHOL <chr>, EPILEPSY <chr>, GASTRIC_ULCERS <chr>, …
## # A tibble: 0 × 222
## # Groups:   SYSIND, Visit_Index [0]
## # ℹ 222 variables: SYSIND <dbl>, EXAM_DATE <date>, SYSXM <dbl>, SYSGP <dbl>,
## #   SYSGPSTUDY <dbl>, SYSINDGP <dbl>, CGI_ORDER <dbl>, GPS_ORDER <dbl>,
## #   STDCGI_ORDER <dbl>, LSTUDY <chr>, DB_OWNER <chr>, STUDY <chr>,
## #   SUBSTUDY <chr>, CENTER <chr>, GP <dbl>, IND <dbl>, REFCTR <chr>,
## #   EXAMINER <chr>, DATE_OF_BIRTH <date>, AGE_AT_EXAM <dbl>,
## #   REVIEW_DATE <date>, REVIEWER <chr>, MINT1A <dbl>, MINT1B <dbl>,
## #   MINT1C <dbl>, MINT1D <dbl>, MINT1F <dbl>, BUTTERFLY_OTHER <chr>, …
## # A tibble: 0 × 222
## # Groups:   SYSIND, Visit_Index [0]
## # ℹ 222 variables: SYSIND <dbl>, EXAM_DATE <date>, SYSXM <dbl>, SYSGP <dbl>,
## #   SYSGPSTUDY <dbl>, SYSINDGP <dbl>, CGI_ORDER <dbl>, GPS_ORDER <dbl>,
## #   STDCGI_ORDER <dbl>, LSTUDY <chr>, DB_OWNER <chr>, STUDY <chr>,
## #   SUBSTUDY <chr>, CENTER <chr>, GP <dbl>, IND <dbl>, REFCTR <chr>,
## #   EXAMINER <chr>, DATE_OF_BIRTH <date>, AGE_AT_EXAM <dbl>,
## #   REVIEW_DATE <date>, REVIEWER <chr>, MINT1A_SP <dbl>, MINT1B_SP <dbl>,
## #   MINT1C_SP <dbl>, MINT1D_SP <dbl>, MINT1F_SP <dbl>, TAMBOR_OTHER_SP <chr>, …
## # A tibble: 0 × 141
## # Groups:   SYSIND, Visit_Index [0]
## # ℹ 141 variables: SYSIND <dbl>, EXAM_DATE <date>, SYSXM <dbl>, SYSGP <dbl>,
## #   SYSGPSTUDY <dbl>, SYSINDGP <dbl>, CGI_ORDER <dbl>, GPS_ORDER <dbl>,
## #   STDCGI_ORDER <dbl>, LSTUDY <chr>, DB_OWNER <chr>, STUDY <chr>,
## #   SUBSTUDY <chr>, CENTER <chr>, GP <dbl>, IND <dbl>, REFCTR <chr>,
## #   EXAMINER <chr>, DATE_OF_BIRTH <date>, AGE_AT_EXAM <dbl>,
## #   REVIEW_DATE <date>, REVIEWER <chr>, MOCALOC <dbl>, MOCALOC_OTHER <chr>,
## #   MOCALAN <dbl>, MOCALANX <chr>, MOCATRAI <dbl>, MOCACUBE <dbl>, …
## # A tibble: 0 × 86
## # Groups:   SYSIND, Visit_Index [0]
## # ℹ 86 variables: SYSIND <dbl>, EXAM_DATE <date>, SYSXM <dbl>, SYSGP <dbl>,
## #   SYSGPSTUDY <dbl>, SYSINDGP <dbl>, CGI_ORDER <dbl>, GPS_ORDER <dbl>,
## #   STDCGI_ORDER <dbl>, LSTUDY <chr>, DB_OWNER <chr>, STUDY <chr>,
## #   SUBSTUDY <chr>, CENTER <chr>, GP <dbl>, IND <dbl>, REFCTR <chr>,
## #   EXAMINER <chr>, DATE_OF_BIRTH <date>, AGE_AT_EXAM <dbl>,
## #   REVIEW_DATE <date>, REVIEWER <chr>, SPF3_R1 <chr>, SPF3_1 <dbl>,
## #   SPF3_R2 <chr>, SPF3_2 <dbl>, SPF4_R1 <chr>, SPF4_1 <dbl>, SPF4_R2 <chr>, …